diff --git a/libs/libvpx/AUTHORS b/libs/libvpx/AUTHORS index fcd5c534a8..04c2872432 100644 --- a/libs/libvpx/AUTHORS +++ b/libs/libvpx/AUTHORS @@ -3,11 +3,13 @@ Aaron Watry Abo Talib Mahfoodh -Adam Xu Adrian Grange Aℓex Converse Ahmad Sharif +Aleksey Vasenev +Alexander Potapenko Alexander Voronov +Alexandra Hájková Alexis Ballier Alok Ahuja Alpha Lam @@ -15,6 +17,7 @@ A.Mahfoodh Ami Fischman Andoni Morales Alastruey Andres Mejia +Andrew Lewis Andrew Russell Angie Chiang Aron Rosenberg @@ -22,11 +25,14 @@ Attila Nagy Brion Vibber changjun.yang Charles 'Buck' Krasic +Cheng Chen chm +Chris Cunningham Christian Duvivier Daniele Castagna Daniel Kang Deb Mukherjee +Deepa K G Dim Temp Dmitry Kovalev Dragan Mrdjan @@ -37,17 +43,21 @@ Fabio Pedretti Frank Galligan Fredrik Söderquist Fritz Koenig +Gabriel Marin Gaute Strokkenes Geza Lore Ghislain MARY Giuseppe Scrivano Gordana Cmiljanovic +Gregor Jasny Guillaume Martres Guillermo Ballester Valor Hangyu Kuang Hanno Böck +Han Shen Henrik Lundin Hui Su +Ivan Krasin Ivan Maltz Jacek Caban Jacky Chen @@ -61,6 +71,7 @@ Jean-Yves Avenard Jeff Faust Jeff Muizelaar Jeff Petkau +Jerome Jiang Jia Jia Jian Zhou Jim Bankoski @@ -75,7 +86,9 @@ Joshua Litt Julia Robson Justin Clift Justin Lebar +Kaustubh Raste KO Myung-Hun +Kyle Siefring Lawrence Velázquez Linfeng Zhang Lou Quillio @@ -91,8 +104,12 @@ Michael Kohler Mike Frysinger Mike Hommey Mikhal Shemer +Min Chen Minghai Shang +Min Ye +Moriyoshi Koizumi Morton Jonuschat +Nathan E. Egge Nico Weber Parag Salasakar Pascal Massimino @@ -101,16 +118,22 @@ Paul Wilkins Pavol Rusnak Paweł Hajdan Pengchong Jin +Peter Boström +Peter Collingbourne Peter de Rivaz Philip Jägenstedt Priit Laes Rafael Ávila de Espíndola Rafaël Carré +Rafael de Lucena Valle +Rahul Chaudhry Ralph Giles +Ranjit Kumar Tulabandu Rob Bradford Ronald S. Bultje Rui Ueyama Sami Pietilä +Sarah Parker Sasi Inguva Scott Graham Scott LaVarnway @@ -118,9 +141,11 @@ Sean McGovern Sergey Kolomenkin Sergey Ulanov Shimon Doodkin +Shiyou Yin Shunyao Li Stefan Holmer Suman Sunkara +Sylvestre Ledru Taekhyun Kim Takanori MATSUURA Tamar Levy @@ -130,7 +155,10 @@ Thijs Vermeir Tim Kopp Timothy B. Terriberry Tom Finegan +Tristan Matthews +Urvang Joshi Vignesh Venkatasubramanian +Vlad Tsyrklevich Yaowu Xu Yi Luo Yongzhe Wang diff --git a/libs/libvpx/CHANGELOG b/libs/libvpx/CHANGELOG index 795d395f96..2281394c8e 100644 --- a/libs/libvpx/CHANGELOG +++ b/libs/libvpx/CHANGELOG @@ -1,3 +1,44 @@ +2017-01-04 v1.7.0 "Mandarin Duck" + This release focused on high bit depth performance (10/12 bit) and vp9 + encoding improvements. + + - Upgrading: + This release is ABI incompatible due to new vp9 encoder features. + + Frame parallel decoding for vp9 has been removed. + + - Enhancements: + vp9 encoding supports additional threads with --row-mt. This can be greater + than the number of tiles. + + Two new vp9 encoder options have been added: + --corpus-complexity + --tune-content=film + + Additional tooling for respecting the vp9 "level" profiles has been added. + + - Bug fixes: + A variety of fuzzing issues. + vp8 threading fix for ARM. + Codec control VP9_SET_SKIP_LOOP_FILTER fixed. + Reject invalid multi resolution configurations. + +2017-01-09 v1.6.1 "Long Tailed Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI compatible with 1.6.0. + + - Enhancements: + Faster VP9 encoding and decoding. + High bit depth builds now provide similar speed for 8 bit encode and decode + for x86 targets. 
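As a rough illustration of the v1.7.0 encoder options listed in the CHANGELOG entry above (a sketch only — flag spellings and accepted values should be checked against this release's vpxenc --help):

    # Hypothetical vpxenc invocation exercising the new v1.7.0 options:
    # row-based multithreading, film-grain content tuning, and corpus-wide
    # complexity for VBR. Filenames are placeholders.
    vpxenc --codec=vp9 --threads=8 --tile-columns=2 --row-mt=1 \
      --tune-content=film --corpus-complexity=50 \
      -o output.webm input.y4m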
Other platforms and higher bit depth improvements are in + progress. + + - Bug Fixes: + A variety of fuzzing issues. + 2016-07-20 v1.6.0 "Khaki Campbell Duck" This release improves upon the VP9 encoder and speeds up the encoding and decoding processes. diff --git a/libs/libvpx/README b/libs/libvpx/README index b9266ca267..73304dd62f 100644 --- a/libs/libvpx/README +++ b/libs/libvpx/README @@ -1,4 +1,4 @@ -README - 20 July 2016 +README - 24 January 2018 Welcome to the WebM VP8/VP9 Codec SDK! @@ -47,6 +47,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: --help output of the configure script. As of this writing, the list of available targets is: + arm64-android-gcc arm64-darwin-gcc arm64-linux-gcc armv7-android-gcc @@ -57,10 +58,13 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-vs11 armv7-win32-vs12 armv7-win32-vs14 + armv7-win32-vs15 armv7s-darwin-gcc armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc @@ -73,6 +77,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin13-gcc x86-darwin14-gcc x86-darwin15-gcc + x86-darwin16-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc @@ -83,6 +88,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-win32-vs11 x86-win32-vs12 x86-win32-vs14 + x86-win32-vs15 x86_64-android-gcc x86_64-darwin9-gcc x86_64-darwin10-gcc @@ -91,6 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin13-gcc x86_64-darwin14-gcc x86_64-darwin15-gcc + x86_64-darwin16-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc @@ -100,6 +107,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-win64-vs11 x86_64-win64-vs12 x86_64-win64-vs14 + x86_64-win64-vs15 generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/libs/libvpx/args.c b/libs/libvpx/args.c index bd1ede0388..a87b138b9d 100644 --- a/libs/libvpx/args.c +++ b/libs/libvpx/args.c @@ -13,6 +13,7 @@ #include #include "args.h" +#include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #if defined(__GNUC__) && __GNUC__ @@ -118,13 +119,13 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { } unsigned int arg_parse_uint(const struct arg *arg) { - long int rawval; + uint32_t rawval; char *endptr; - rawval = strtol(arg->val, &endptr, 10); + rawval = (uint32_t)strtoul(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { - if (rawval >= 0 && rawval <= UINT_MAX) return (unsigned int)rawval; + if (rawval <= UINT_MAX) return rawval; die("Option %s: Value %ld out of range for unsigned int\n", arg->name, rawval); @@ -135,10 +136,10 @@ unsigned int arg_parse_uint(const struct arg *arg) { } int arg_parse_int(const struct arg *arg) { - long int rawval; + int32_t rawval; char *endptr; - rawval = strtol(arg->val, &endptr, 10); + rawval = (int32_t)strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; diff --git a/libs/libvpx/build/make/Android.mk b/libs/libvpx/build/make/Android.mk index 36120170e8..a88f90056e 100644 --- a/libs/libvpx/build/make/Android.mk +++ b/libs/libvpx/build/make/Android.mk @@ -64,6 +64,9 @@ CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) +ifneq ($(V),1) + qexec := @ +endif # Use the makefiles generated by upstream configure to determine which files to # build. Also set any architecture-specific flags. 
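The qexec assignment in the Android.mk hunk above follows the usual "silent rule" make idiom: commands are prefixed with $(qexec), which expands to "@" (suppressing command echo) unless the build is invoked with V=1. A sketch of the effect when driving this file with ndk-build (invocation details are illustrative, not the project's documented command line):

    # Quiet build: the ads2gas/mkdir commands run without being echoed.
    ndk-build APP_BUILD_SCRIPT=Android.mk
    # Verbose build: V=1 leaves qexec empty, so every command is printed.
    ndk-build APP_BUILD_SCRIPT=Android.mk V=1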
@@ -71,7 +74,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) include $(CONFIG_DIR)libs-armv7-android-gcc.mk LOCAL_ARM_MODE := arm else ifeq ($(TARGET_ARCH_ABI),arm64-v8a) - include $(CONFIG_DIR)libs-armv8-android-gcc.mk + include $(CONFIG_DIR)libs-arm64-android-gcc.mk LOCAL_ARM_MODE := arm else ifeq ($(TARGET_ARCH_ABI),x86) include $(CONFIG_DIR)libs-x86-android-gcc.mk @@ -101,10 +104,10 @@ LOCAL_CFLAGS := -O3 # like x86inc.asm and x86_abi_support.asm LOCAL_ASMFLAGS := -I$(LIBVPX_PATH) -.PRECIOUS: %.asm.s -$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm - @mkdir -p $(dir $@) - @$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ +.PRECIOUS: %.asm.S +$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ # For building *_rtcd.h, which have rules in libs.mk TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN))) @@ -132,7 +135,7 @@ endif # Pull out assembly files, splitting NEON from the rest. This is # done to specify that the NEON assembly files use NEON assembler flags. -# x86 assembly matches %.asm, arm matches %.asm.s +# x86 assembly matches %.asm, arm matches %.asm.S # x86: @@ -140,31 +143,44 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE)) LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file)) # arm: -CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE)) +CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE)) CODEC_SRCS_ASM_ARM = $(foreach v, \ $(CODEC_SRCS_ASM_ARM_ALL), \ $(if $(findstring neon,$(v)),,$(v))) -CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \ - $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \ +CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \ + $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ $(CODEC_SRCS_ASM_ARM)) LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS) ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S CODEC_SRCS_ASM_NEON = $(foreach v, \ $(CODEC_SRCS_ASM_ARM_ALL),\ $(if $(findstring neon,$(v)),$(v),)) - CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \ - $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \ + CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \ + $(CODEC_SRCS_ASM_NEON)) + CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \ + $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ $(CODEC_SRCS_ASM_NEON)) - LOCAL_SRC_FILES += $(patsubst %.s, \ - %.s.neon, \ + LOCAL_SRC_FILES += $(patsubst %.S, \ + %.S.neon, \ $(CODEC_SRCS_ASM_NEON_ADS2GAS)) + + NEON_ASM_TARGETS = $(patsubst %.S, \ + $(ASM_CNV_PATH)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_NEON)) +# add a dependency to the full path to the ads2gas output to ensure the +# includes are converted first. 
+ifneq ($(strip $(NEON_ASM_TARGETS)),) +$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES)) +endif endif LOCAL_CFLAGS += \ -DHAVE_CONFIG_H=vpx_config.h \ -I$(LIBVPX_PATH) \ - -I$(ASM_CNV_PATH) + -I$(ASM_CNV_PATH) \ + -I$(ASM_CNV_PATH)/libvpx LOCAL_MODULE := libvpx @@ -185,7 +201,8 @@ endif $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h -ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),) +rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a +ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),) $$(rtcd_dep_template_SRCS): vpx_config.asm endif endef @@ -195,16 +212,17 @@ $(eval $(call rtcd_dep_template)) .PHONY: clean clean: @echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]" - @$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS) - @$(RM) -r $(ASM_CNV_PATH) - @$(RM) $(CLEAN-OBJS) + $(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS) + $(qexec)$(RM) -r $(ASM_CNV_PATH) + $(qexec)$(RM) $(CLEAN-OBJS) ifeq ($(ENABLE_SHARED),1) + LOCAL_CFLAGS += -fPIC include $(BUILD_SHARED_LIBRARY) else include $(BUILD_STATIC_LIBRARY) endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) -$(call import-module,cpufeatures) +$(call import-module,android/cpufeatures) endif diff --git a/libs/libvpx/build/make/Makefile b/libs/libvpx/build/make/Makefile index 469eb74c3a..f6b3f0630f 100644 --- a/libs/libvpx/build/make/Makefile +++ b/libs/libvpx/build/make/Makefile @@ -90,7 +90,7 @@ all: .PHONY: clean clean:: - rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s) + rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S) rm -f $(CLEAN-OBJS) .PHONY: clean @@ -124,6 +124,7 @@ ifeq ($(TOOLCHAIN), x86-os2-gcc) CFLAGS += -mstackrealign endif +# x86[_64] $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx $(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 @@ -138,6 +139,12 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl + +# POWER +$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx +$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") @@ -180,13 +187,13 @@ $(BUILD_PFX)%.asm.o: %.asm $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) $(qexec)$(AS) $(ASFLAGS) -o $@ $< -$(BUILD_PFX)%.s.d: %.s +$(BUILD_PFX)%.S.d: %.S $(if $(quiet),@echo " [DEP] $@") $(qexec)mkdir -p $(dir $@) $(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \ --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@ -$(BUILD_PFX)%.s.o: %.s +$(BUILD_PFX)%.S.o: %.S $(if $(quiet),@echo " [AS] $@") $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) $(qexec)$(AS) $(ASFLAGS) -o $@ $< @@ -198,8 +205,8 @@ $(BUILD_PFX)%.c.S: %.c $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) $(qexec)$(CC) -S $(CFLAGS) -o $@ $< -.PRECIOUS: %.asm.s -$(BUILD_PFX)%.asm.s: %.asm +.PRECIOUS: %.asm.S +$(BUILD_PFX)%.asm.S: %.asm $(if $(quiet),@echo " [ASM CONVERSION] $@") $(qexec)mkdir -p $(dir $@) $(qexec)$(ASM_CONVERSION) <$< >$@ diff --git a/libs/libvpx/build/make/ads2gas.pl b/libs/libvpx/build/make/ads2gas.pl index 7272424af2..029cc4a56f 100755 --- a/libs/libvpx/build/make/ads2gas.pl +++ b/libs/libvpx/build/make/ads2gas.pl @@ -138,14 +138,6 @@ while () 
s/DCD(.*)/.long $1/; s/DCB(.*)/.byte $1/; - # RN to .req - if (s/RN\s+([Rr]\d+|lr)/.req $1/) - { - print; - print "$comment_sub$comment\n" if defined $comment; - next; - } - # Make function visible to linker, and make additional symbol with # prepended underscore s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; diff --git a/libs/libvpx/build/make/ads2gas_apple.pl b/libs/libvpx/build/make/ads2gas_apple.pl index 1a9e105ba8..e1ae7b4f87 100755 --- a/libs/libvpx/build/make/ads2gas_apple.pl +++ b/libs/libvpx/build/make/ads2gas_apple.pl @@ -120,18 +120,6 @@ while () s/DCD(.*)/.long $1/; s/DCB(.*)/.byte $1/; - # Build a hash of all the register - alias pairs. - if (s/(.*)RN(.*)/$1 .req $2/g) - { - $register_aliases{trim($1)} = trim($2); - next; - } - - while (($key, $value) = each(%register_aliases)) - { - s/\b$key\b/$value/g; - } - # Make function visible to linker, and make additional symbol with # prepended underscore s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; diff --git a/libs/libvpx/build/make/configure.sh b/libs/libvpx/build/make/configure.sh index 745e10ddc8..683b430374 100644 --- a/libs/libvpx/build/make/configure.sh +++ b/libs/libvpx/build/make/configure.sh @@ -332,20 +332,10 @@ EOF } check_cflags() { - log check_cflags "$@" - - case "$CC" in - *gcc*|*clang) - check_cc -Werror "$@" <> config.mk @@ -645,7 +652,7 @@ setup_gnu_toolchain() { AS=${AS:-${CROSS}as} STRIP=${STRIP:-${CROSS}strip} NM=${NM:-${CROSS}nm} - AS_SFX=.s + AS_SFX=.S EXE_SFX= } @@ -684,7 +691,6 @@ check_xcode_minimum_version() { process_common_toolchain() { if [ -z "$toolchain" ]; then gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}" - # detect tgt_isa case "$gcctarget" in aarch64*) @@ -707,6 +713,18 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; + power*64*-*) + tgt_isa=ppc64 + ;; + power*) + tgt_isa=ppc + ;; + *mips64el*) + tgt_isa=mips64 + ;; + *mips32el*) + tgt_isa=mips32 + ;; esac # detect tgt_os @@ -735,9 +753,16 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin15 ;; + *darwin16*) + tgt_isa=x86_64 + tgt_os=darwin16 + ;; x86_64*mingw32*) tgt_os=win64 ;; + x86_64*cygwin*) + tgt_os=win64 + ;; *mingw32*|*cygwin*) [ -z "$tgt_isa" ] && tgt_isa=x86 tgt_os=win32 @@ -785,6 +810,9 @@ process_common_toolchain() { mips*) enable_feature mips ;; + ppc*) + enable_feature ppc + ;; esac # PIC is probably what we want when building shared libs @@ -853,6 +881,10 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.11" add_ldflags "-mmacosx-version-min=10.11" ;; + *-darwin16-*) + add_cflags "-mmacosx-version-min=10.12" + add_ldflags "-mmacosx-version-min=10.12" + ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" @@ -936,7 +968,7 @@ EOF ;; vs*) asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" - AS_SFX=.s + AS_SFX=.S msvs_arch_dir=arm-msvs disable_feature multithread disable_feature unit_tests @@ -946,6 +978,7 @@ EOF # only "AppContainerApplication" which requires an AppxManifest. # Therefore disable the examples, just build the library. 
disable_feature examples + disable_feature tools fi ;; rvct) @@ -1044,7 +1077,7 @@ EOF STRIP="$(${XCRUN_FIND} strip)" NM="$(${XCRUN_FIND} nm)" RANLIB="$(${XCRUN_FIND} ranlib)" - AS_SFX=.s + AS_SFX=.S LD="${CXX:-$(${XCRUN_FIND} ld)}" # ASFLAGS is written here instead of using check_add_asflags @@ -1153,10 +1186,20 @@ EOF fi fi + if enabled mmi; then + tgt_isa=loongson3a + check_add_ldflags -march=loongson3a + fi + check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; + ppc*) + link_with_cc=gcc + setup_gnu_toolchain + check_gcc_machine_option "vsx" + ;; x86*) case ${tgt_os} in win*) @@ -1211,6 +1254,13 @@ EOF AS=msvs msvs_arch_dir=x86-msvs vc_version=${tgt_cc##vs} + case $vc_version in + 7|8|9|10|11|12|13|14) + echo "${tgt_cc} does not support avx512, disabling....." + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + soft_disable avx512 + ;; + esac case $vc_version in 7|8|9|10) echo "${tgt_cc} does not support avx/avx2, disabling....." @@ -1255,9 +1305,18 @@ EOF elif disabled $ext; then disable_exts="yes" else - # use the shortened version for the flag: sse4_1 -> sse4 - check_gcc_machine_option ${ext%_*} $ext + if [ "$ext" = "avx512" ]; then + check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + else + # use the shortened version for the flag: sse4_1 -> sse4 + check_gcc_machine_option ${ext%_*} $ext + fi fi + + # https://bugs.chromium.org/p/webm/issues/detail?id=1464 + # The assembly optimizations for vpx_sub_pixel_variance do not link with + # gcc 6. + enabled sse2 && soft_enable pic done if enabled external_build; then @@ -1282,7 +1341,6 @@ EOF esac log_echo " using $AS" fi - [ "${AS##*/}" = nasm ] && add_asflags -Ox AS_SFX=.asm case ${tgt_os} in win32) @@ -1291,7 +1349,7 @@ EOF EXE_SFX=.exe ;; win64) - add_asflags -f x64 + add_asflags -f win64 enabled debug && add_asflags -g cv8 EXE_SFX=.exe ;; @@ -1425,6 +1483,10 @@ EOF echo "msa optimizations are available only for little endian platforms" disable_feature msa fi + if enabled mmi; then + echo "mmi optimizations are available only for little endian platforms" + disable_feature mmi + fi fi ;; esac diff --git a/libs/libvpx/build/make/gen_msvs_sln.sh b/libs/libvpx/build/make/gen_msvs_sln.sh index 7d5f468109..401223a0b5 100755 --- a/libs/libvpx/build/make/gen_msvs_sln.sh +++ b/libs/libvpx/build/make/gen_msvs_sln.sh @@ -25,7 +25,7 @@ files. 
Options: --help Print this message --out=outfile Redirect output to a file - --ver=version Version (7,8,9,10,11,12,14) of visual studio to generate for + --ver=version Version (7,8,9,10,11,12,14,15) of visual studio to generate for --target=isa-os-cc Target specifier EOF exit 1 @@ -215,7 +215,7 @@ for opt in "$@"; do ;; --ver=*) vs_ver="$optval" case $optval in - 10|11|12|14) + 10|11|12|14|15) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -240,9 +240,12 @@ case "${vs_ver:-10}" in 12) sln_vers="12.00" sln_vers_str="Visual Studio 2013" ;; - 14) sln_vers="14.00" + 14) sln_vers="12.00" sln_vers_str="Visual Studio 2015" ;; + 15) sln_vers="12.00" + sln_vers_str="Visual Studio 2017" + ;; esac sfx=vcxproj diff --git a/libs/libvpx/build/make/gen_msvs_vcxproj.sh b/libs/libvpx/build/make/gen_msvs_vcxproj.sh index e98611d102..171d0b99b6 100755 --- a/libs/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/libs/libvpx/build/make/gen_msvs_vcxproj.sh @@ -34,7 +34,7 @@ Options: --name=project_name Name of the project (required) --proj-guid=GUID GUID to use for the project --module-def=filename File containing export definitions (for DLLs) - --ver=version Version (10,11,12,14) of visual studio to generate for + --ver=version Version (10,11,12,14,15) of visual studio to generate for --src-path-bare=dir Path to root of source tree -Ipath/to/include Additional include directories -DFLAG[=value] Preprocessor macros to define @@ -82,7 +82,7 @@ generate_filter() { | sed -e "s,$src_path_bare,," \ -e 's/^[\./]\+//g' -e 's,[:/ ],_,g') - if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then + if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then # Avoid object file name collisions, i.e. vpx_config.c and # vpx_config.asm produce the same object file without # this additional suffix. @@ -168,7 +168,7 @@ for opt in "$@"; do --ver=*) vs_ver="$optval" case "$optval" in - 10|11|12|14) + 10|11|12|14|15) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -203,7 +203,7 @@ for opt in "$@"; do # The paths in file_list are fixed outside of the loop. 
file_list[${#file_list[@]}]="$opt" case "$opt" in - *.asm|*.s) uses_asm=true + *.asm|*.[Ss]) uses_asm=true ;; esac ;; @@ -218,7 +218,7 @@ guid=${guid:-`generate_uuid`} asm_use_custom_step=false uses_asm=${uses_asm:-false} case "${vs_ver:-11}" in - 10|11|12|14) + 10|11|12|14|15) asm_use_custom_step=$uses_asm ;; esac @@ -347,6 +347,9 @@ generate_vcxproj() { if [ "$vs_ver" = "14" ]; then tag_content PlatformToolset v140 fi + if [ "$vs_ver" = "15" ]; then + tag_content PlatformToolset v141 + fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then tag_content WholeProgramOptimization true @@ -452,7 +455,7 @@ generate_vcxproj() { done open_tag ItemGroup - generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s" + generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S" close_tag ItemGroup open_tag ItemGroup generate_filter "Header Files" "h;hm;inl;inc;xsd" diff --git a/libs/libvpx/build/make/iosbuild.sh b/libs/libvpx/build/make/iosbuild.sh index c703f22b0c..365a8c0130 100755 --- a/libs/libvpx/build/make/iosbuild.sh +++ b/libs/libvpx/build/make/iosbuild.sh @@ -35,8 +35,8 @@ ARM_TARGETS="arm64-darwin-gcc armv7s-darwin-gcc" SIM_TARGETS="x86-iphonesimulator-gcc x86_64-iphonesimulator-gcc" -OSX_TARGETS="x86-darwin15-gcc - x86_64-darwin15-gcc" +OSX_TARGETS="x86-darwin16-gcc + x86_64-darwin16-gcc" TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" # Configures for the target specified by $1, and invokes make with the dist @@ -271,7 +271,7 @@ cat << EOF --help: Display this message and exit. --enable-shared: Build a dynamic framework for use on iOS 8 or later. --extra-configure-args : Extra args to pass when configuring libvpx. - --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86 + --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86 and x86_64. Allows linking to framework when builds target MacOSX instead of iOS. --preserve-build-output: Do not delete the build directory. diff --git a/libs/libvpx/build/make/rtcd.pl b/libs/libvpx/build/make/rtcd.pl index 9e746c46d0..68e92b52cc 100755 --- a/libs/libvpx/build/make/rtcd.pl +++ b/libs/libvpx/build/make/rtcd.pl @@ -1,4 +1,13 @@ #!/usr/bin/env perl +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## no strict 'refs'; use warnings; @@ -200,6 +209,7 @@ sub filter { sub common_top() { my $include_guard = uc($opts{sym})."_H_"; print < $$.tmp else cat<$$.tmp +// This file is generated. Do not edit. 
#define VERSION_MAJOR $major_version #define VERSION_MINOR $minor_version #define VERSION_PATCH $patch_version diff --git a/libs/libvpx/configure b/libs/libvpx/configure index 95af2e395e..e5a74c6f2a 100755 --- a/libs/libvpx/configure +++ b/libs/libvpx/configure @@ -22,6 +22,7 @@ show_help(){ Advanced options: ${toggle_libs} libraries ${toggle_examples} examples + ${toggle_tools} tools ${toggle_docs} documentation ${toggle_unit_tests} unit tests ${toggle_decode_perf_tests} build decoder perf tests with unit tests @@ -108,10 +109,13 @@ all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-win32-vs11" all_platforms="${all_platforms} armv7-win32-vs12" all_platforms="${all_platforms} armv7-win32-vs14" +all_platforms="${all_platforms} armv7-win32-vs15" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" +all_platforms="${all_platforms} ppc64-linux-gcc" +all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" all_platforms="${all_platforms} x86-darwin8-gcc" @@ -124,6 +128,7 @@ all_platforms="${all_platforms} x86-darwin12-gcc" all_platforms="${all_platforms} x86-darwin13-gcc" all_platforms="${all_platforms} x86-darwin14-gcc" all_platforms="${all_platforms} x86-darwin15-gcc" +all_platforms="${all_platforms} x86-darwin16-gcc" all_platforms="${all_platforms} x86-iphonesimulator-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" @@ -134,6 +139,7 @@ all_platforms="${all_platforms} x86-win32-vs10" all_platforms="${all_platforms} x86-win32-vs11" all_platforms="${all_platforms} x86-win32-vs12" all_platforms="${all_platforms} x86-win32-vs14" +all_platforms="${all_platforms} x86-win32-vs15" all_platforms="${all_platforms} x86_64-android-gcc" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" @@ -142,6 +148,7 @@ all_platforms="${all_platforms} x86_64-darwin12-gcc" all_platforms="${all_platforms} x86_64-darwin13-gcc" all_platforms="${all_platforms} x86_64-darwin14-gcc" all_platforms="${all_platforms} x86_64-darwin15-gcc" +all_platforms="${all_platforms} x86_64-darwin16-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" @@ -151,22 +158,26 @@ all_platforms="${all_platforms} x86_64-win64-vs10" all_platforms="${all_platforms} x86_64-win64-vs11" all_platforms="${all_platforms} x86_64-win64-vs12" all_platforms="${all_platforms} x86_64-win64-vs14" +all_platforms="${all_platforms} x86_64-win64-vs15" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured # note that these should be in dependency order for now. -all_targets="libs examples docs" +all_targets="libs examples tools docs" # all targets available are enabled, by default. for t in ${all_targets}; do [ -f "${source_path}/${t}.mk" ] && enable_feature ${t} done +if ! diff --version >/dev/null; then + die "diff missing: Try installing diffutils via your package manager." +fi + if ! 
perl --version >/dev/null; then die "Perl is required to build" fi - if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then # test to see if source_path already configured if [ -f "${source_path}/vpx_config.h" ]; then @@ -222,6 +233,7 @@ ARCH_LIST=" mips x86 x86_64 + ppc " ARCH_EXT_LIST_X86=" mmx @@ -232,7 +244,13 @@ ARCH_EXT_LIST_X86=" sse4_1 avx avx2 + avx512 " + +ARCH_EXT_LIST_LOONGSON=" + mmi +" + ARCH_EXT_LIST=" neon neon_asm @@ -243,6 +261,10 @@ ARCH_EXT_LIST=" mips64 ${ARCH_EXT_LIST_X86} + + vsx + + ${ARCH_EXT_LIST_LOONGSON} " HAVE_LIST=" ${ARCH_EXT_LIST} @@ -254,7 +276,6 @@ EXPERIMENT_LIST=" spatial_svc fp_mb_stats emulate_hardware - misc_fixes " CONFIG_LIST=" dependency_tracking @@ -309,6 +330,7 @@ CONFIG_LIST=" better_hw_compatibility experimental size_limit + always_adjust_bpm ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -331,6 +353,7 @@ CMDLINE_SELECT=" libs examples + tools docs libc as @@ -367,6 +390,7 @@ CMDLINE_SELECT=" better_hw_compatibility vp9_highbitdepth experimental + always_adjust_bpm " process_cmdline() { @@ -476,7 +500,7 @@ EOF # # Write makefiles for all enabled targets # - for tgt in libs examples docs solution; do + for tgt in libs examples tools docs solution; do tgt_fn="$tgt-$toolchain.mk" if enabled $tgt; then @@ -568,6 +592,7 @@ process_toolchain() { check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wfloat-conversion + check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual @@ -575,12 +600,20 @@ process_toolchain() { check_add_cflags -Wimplicit-function-declaration check_add_cflags -Wuninitialized check_add_cflags -Wunused + # -Wextra has some tricky cases. Rather than fix them all now, get the + # flag for as many files as possible and fix the remaining issues + # piecemeal. + # https://bugs.chromium.org/p/webm/issues/detail?id=1069 + check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with # -Wundef so add it explicitly to CFLAGS only. check_cflags -Wundef && add_cflags_only -Wundef if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi + # Avoid this warning for third_party C++ sources. Some reorganization + # would be needed to apply this only to test/*.cc. 
+ check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 fi if enabled icc; then @@ -632,7 +665,7 @@ process_toolchain() { gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" all_targets="${all_targets} solution" - INLINE="__forceinline" + INLINE="__inline" ;; esac diff --git a/libs/libvpx/examples.mk b/libs/libvpx/examples.mk index cc7fb1ddcf..38c4d75c51 100644 --- a/libs/libvpx/examples.mk +++ b/libs/libvpx/examples.mk @@ -76,6 +76,7 @@ vpxdec.SRCS += tools_common.c tools_common.h vpxdec.SRCS += y4menc.c y4menc.h ifeq ($(CONFIG_LIBYUV),yes) vpxdec.SRCS += $(LIBYUV_SRCS) + $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += -Wno-unused-parameter endif ifeq ($(CONFIG_WEBM_IO),yes) vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) diff --git a/libs/libvpx/examples/vp8_multi_resolution_encoder.c b/libs/libvpx/examples/vp8_multi_resolution_encoder.c index 65308a0bd0..b14b1ff397 100644 --- a/libs/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/libs/libvpx/examples/vp8_multi_resolution_encoder.c @@ -151,7 +151,7 @@ static void write_ivf_frame_header(FILE *outfile, if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; pts = pkt->data.frame.pts; - mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header, (int)pkt->data.frame.sz); mem_put_le32(header + 4, pts & 0xFFFFFFFF); mem_put_le32(header + 8, pts >> 32); @@ -190,7 +190,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[0] = 0; cfg->ts_layer_id[1] = 1; // Use 60/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.6f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate); cfg->ts_target_bitrate[1] = bitrate; /* 0=L, 1=GF */ @@ -240,9 +240,9 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[1] = 2; cfg->ts_layer_id[2] = 1; cfg->ts_layer_id[3] = 2; - // Use 40/20/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.4f * bitrate; - cfg->ts_target_bitrate[1] = 0.6f * bitrate; + // Use 45/20/35 bit allocation as example. + cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate); + cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate); cfg->ts_target_bitrate[2] = bitrate; /* 0=L, 1=GF, 2=ARF */ @@ -294,8 +294,8 @@ int main(int argc, char **argv) { vpx_codec_err_t res[NUM_ENCODERS]; int i; - long width; - long height; + int width; + int height; int length_frame; int frame_avail; int got_data; @@ -347,9 +347,9 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(interface)); - width = strtol(argv[1], NULL, 0); - height = strtol(argv[2], NULL, 0); - framerate = strtol(argv[3], NULL, 0); + width = (int)strtol(argv[1], NULL, 0); + height = (int)strtol(argv[2], NULL, 0); + framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) die("Invalid resolution: %ldx%ld", width, height); @@ -371,12 +371,13 @@ int main(int argc, char **argv) { // Bitrates per spatial layer: overwrite default rates above. for (i = 0; i < NUM_ENCODERS; i++) { - target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); + target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); } // Temporal layers per spatial layers: overwrite default settings above. 
for (i = 0; i < NUM_ENCODERS; i++) { - num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); + num_temporal_layers[i] = + (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", num_temporal_layers); @@ -391,9 +392,9 @@ int main(int argc, char **argv) { downsampled_input[i] = fopen(filename, "wb"); } - key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); + key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); - show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); + show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); /* Populate default encoder configuration */ for (i = 0; i < NUM_ENCODERS; i++) { @@ -460,7 +461,7 @@ int main(int argc, char **argv) { // Set the number of threads per encode/spatial layer. // (1, 1, 1) means no encoder threading. - cfg[0].g_threads = 2; + cfg[0].g_threads = 1; cfg[1].g_threads = 1; cfg[2].g_threads = 1; @@ -469,7 +470,7 @@ int main(int argc, char **argv) { if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); - if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) + if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) read_frame_p = read_frame; else read_frame_p = read_frame_by_row; @@ -507,9 +508,11 @@ int main(int argc, char **argv) { /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */ /* Enable denoising for the highest-resolution encoder. */ - if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 4)) + if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1)) die_codec(&codec[0], "Failed to set noise_sensitivity"); - for (i = 1; i < NUM_ENCODERS; i++) { + if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1)) + die_codec(&codec[1], "Failed to set noise_sensitivity"); + for (i = 2; i < NUM_ENCODERS; i++) { if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0)) die_codec(&codec[i], "Failed to set noise_sensitivity"); } @@ -556,7 +559,8 @@ int main(int argc, char **argv) { /* Write out down-sampled input. */ length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2; if (fwrite(raw[i].planes[0], 1, length_frame, - downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) { + downsampled_input[NUM_ENCODERS - i - 1]) != + (unsigned int)length_frame) { return EXIT_FAILURE; } } @@ -617,10 +621,6 @@ int main(int argc, char **argv) { break; default: break; } - printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT && - (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY) - ? 
"K" - : ""); fflush(stdout); } } @@ -661,7 +661,6 @@ int main(int argc, char **argv) { write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1); fclose(outfile[i]); } - printf("\n"); return EXIT_SUCCESS; } diff --git a/libs/libvpx/examples/vp9_spatial_svc_encoder.c b/libs/libvpx/examples/vp9_spatial_svc_encoder.c index cecdce0804..0987cbfb85 100644 --- a/libs/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libs/libvpx/examples/vp9_spatial_svc_encoder.c @@ -84,6 +84,8 @@ static const arg_def_t speed_arg = ARG_DEF("sp", "speed", 1, "speed configuration"); static const arg_def_t aqmode_arg = ARG_DEF("aq", "aqmode", 1, "aq-mode off/on"); +static const arg_def_t bitrates_arg = + ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]"); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = { &frames_arg, #endif &speed_arg, &rc_end_usage_arg, + &bitrates_arg, NULL }; static const uint32_t default_frames_to_skip = 0; @@ -165,7 +168,7 @@ void usage_exit(void) { static void parse_command_line(int argc, const char **argv_, AppInput *app_input, SvcContext *svc_ctx, vpx_codec_enc_cfg_t *enc_cfg) { - struct arg arg = { 0 }; + struct arg arg; char **argv = NULL; char **argi = NULL; char **argj = NULL; @@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_, } else if (arg_match(&arg, &scale_factors_arg, argi)) { snprintf(string_options, sizeof(string_options), "%s scale-factors=%s", string_options, arg.val); + } else if (arg_match(&arg, &bitrates_arg, argi)) { + snprintf(string_options, sizeof(string_options), "%s bitrates=%s", + string_options, arg.val); } else if (arg_match(&arg, &passes_arg, argi)) { passes = arg_parse_uint(&arg); if (passes < 1 || passes > 2) { @@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc, for (sl = 0; sl < cfg->ss_number_layers; ++sl) { for (tl = 0; tl < cfg->ts_number_layers; ++tl) { const int layer = sl * cfg->ts_number_layers + tl; - const int tlayer0 = sl * cfg->ts_number_layers; if (cfg->ts_number_layers == 1) rc->layer_framerate[layer] = framerate; else @@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc, cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { - rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] / - rc->layer_framerate[tlayer0]; + rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / + rc->layer_framerate[layer]; } rc->layer_input_frames[layer] = 0; rc->layer_enc_frames[layer] = 0; @@ -449,12 +454,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc, vpx_codec_enc_cfg_t *cfg, int frame_cnt) { unsigned int sl, tl; - int tot_num_frames = 0; double perc_fluctuation = 0.0; + int tot_num_frames = 0; printf("Total number of processed frames: %d\n\n", frame_cnt - 1); printf("Rate control layer stats for sl%d tl%d layer(s):\n\n", cfg->ss_number_layers, cfg->ts_number_layers); for (sl = 0; sl < cfg->ss_number_layers; ++sl) { + tot_num_frames = 0; for (tl = 0; tl < cfg->ts_number_layers; ++tl) { const int layer = sl * cfg->ts_number_layers + tl; const int num_dropped = @@ -462,7 +468,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc, ? 
(rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) : (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - 1); - if (!sl) tot_num_frames += rc->layer_input_frames[layer]; + tot_num_frames += rc->layer_input_frames[layer]; rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] * rc->layer_encoding_bitrate[layer] / tot_num_frames; @@ -503,7 +509,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc, } vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { + uint64_t sizes[8], int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. If we have the marker but @@ -600,9 +606,9 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } int main(int argc, const char **argv) { - AppInput app_input = { 0 }; + AppInput app_input; VpxVideoWriter *writer = NULL; - VpxVideoInfo info = { 0 }; + VpxVideoInfo info; vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; @@ -620,7 +626,7 @@ int main(int argc, const char **argv) { struct RateControlStats rc; vpx_svc_layer_id_t layer_id; vpx_svc_ref_frame_config_t ref_frame_config; - int sl, tl; + unsigned int sl, tl; double sum_bitrate = 0.0; double sum_bitrate2 = 0.0; double framerate = 30.0; @@ -634,8 +640,9 @@ int main(int argc, const char **argv) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, enc_cfg.g_w, enc_cfg.g_h, 32)) { die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); } @@ -673,7 +680,7 @@ int main(int argc, const char **argv) { } #if OUTPUT_RC_STATS // For now, just write temporal layer streams. - // TODO(wonkap): do spatial by re-writing superframe. + // TODO(marpan): do spatial by re-writing superframe. if (svc_ctx.output_rc_stat) { for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { char file_name[PATH_MAX]; @@ -691,10 +698,18 @@ int main(int argc, const char **argv) { if (svc_ctx.speed != -1) vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); - if (svc_ctx.threads) + if (svc_ctx.threads) { vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); + if (svc_ctx.threads > 1) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + if (svc_ctx.speed >= 5) + vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); // Encode frames while (!end_of_stream) { @@ -730,7 +745,7 @@ int main(int argc, const char **argv) { &ref_frame_config); // Keep track of input frames, to account for frame drops in rate control // stats/metrics. 
- for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) { ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id]; } @@ -755,18 +770,20 @@ int main(int argc, const char **argv) { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { #if OUTPUT_RC_STATS - uint32_t sizes[8]; + uint64_t sizes[8]; int count = 0; #endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan/wonkap): Put this (to line728) in separate function. + // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, sizes, &count); + if (enc_cfg.ss_number_layers == 1) + sizes[0] = cx_pkt->data.frame.sz; // Note computing input_layer_frames here won't account for frame // drops in rate control stats. // TODO(marpan): Fix this for non-bypass mode so we can get stats @@ -793,7 +810,7 @@ int main(int argc, const char **argv) { rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; // Keep count of rate control stats per layer, for non-key // frames. - if (tl == layer_id.temporal_layer_id && + if (tl == (unsigned int)layer_id.temporal_layer_id && !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl]; rc.layer_avg_rate_mismatch[layer] += @@ -807,7 +824,7 @@ int main(int argc, const char **argv) { // Update for short-time encoding bitrate states, for moving // window of size rc->window, shifted by rc->window / 2. // Ignore first window segment, due to key frame. - if (frame_cnt > rc.window_size) { + if (frame_cnt > (unsigned int)rc.window_size) { tl = layer_id.temporal_layer_id; for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; @@ -823,13 +840,14 @@ int main(int argc, const char **argv) { } // Second shifted window. 
- if (frame_cnt > rc.window_size + rc.window_size / 2) { + if (frame_cnt > + (unsigned int)(rc.window_size + rc.window_size / 2)) { tl = layer_id.temporal_layer_id; for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; } - if (frame_cnt > 2 * rc.window_size && + if (frame_cnt > (unsigned int)(2 * rc.window_size) && frame_cnt % rc.window_size == 0) { rc.window_count += 1; rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; @@ -842,10 +860,11 @@ int main(int argc, const char **argv) { } #endif } - + /* printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); + */ if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; ++frames_received; diff --git a/libs/libvpx/examples/vp9cx_set_ref.c b/libs/libvpx/examples/vp9cx_set_ref.c index 798d7e3f2b..3472689db2 100644 --- a/libs/libvpx/examples/vp9cx_set_ref.c +++ b/libs/libvpx/examples/vp9cx_set_ref.c @@ -304,6 +304,7 @@ int main(int argc, char **argv) { const char *height_arg = NULL; const char *infile_arg = NULL; const char *outfile_arg = NULL; + const char *update_frame_num_arg = NULL; unsigned int limit = 0; vp9_zero(ecodec); @@ -318,18 +319,21 @@ int main(int argc, char **argv) { height_arg = argv[2]; infile_arg = argv[3]; outfile_arg = argv[4]; + update_frame_num_arg = argv[5]; encoder = get_vpx_encoder_by_name("vp9"); if (!encoder) die("Unsupported codec."); - update_frame_num = atoi(argv[5]); + update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0); // In VP9, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are // allocated while calling vpx_codec_encode(), thus, setting reference for // 1st frame isn't supported. - if (update_frame_num <= 1) die("Couldn't parse frame number '%s'\n", argv[5]); + if (update_frame_num <= 1) { + die("Couldn't parse frame number '%s'\n", update_frame_num_arg); + } if (argc > 6) { - limit = atoi(argv[6]); + limit = (unsigned int)strtoul(argv[6], NULL, 0); if (update_frame_num > limit) die("Update frame number couldn't larger than limit\n"); } diff --git a/libs/libvpx/examples/vpx_temporal_svc_encoder.c b/libs/libvpx/examples/vpx_temporal_svc_encoder.c index 309a2fe2e2..f5736ea45d 100644 --- a/libs/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/libs/libvpx/examples/vpx_temporal_svc_encoder.c @@ -26,17 +26,27 @@ #include "../tools_common.h" #include "../video_writer.h" +#define VP8_ROI_MAP 0 + static const char *exec_name; void usage_exit(void) { exit(EXIT_FAILURE); } -// Denoiser states, for temporal denoising. -enum denoiserState { - kDenoiserOff, - kDenoiserOnYOnly, - kDenoiserOnYUV, - kDenoiserOnYUVAggressive, - kDenoiserOnAdaptive +// Denoiser states for vp8, for temporal denoising. +enum denoiserStateVp8 { + kVp8DenoiserOff, + kVp8DenoiserOnYOnly, + kVp8DenoiserOnYUV, + kVp8DenoiserOnYUVAggressive, + kVp8DenoiserOnAdaptive +}; + +// Denoiser states for vp9, for temporal denoising. +enum denoiserStateVp9 { + kVp9DenoiserOff, + kVp9DenoiserOnYOnly, + // For SVC: denoise the top two spatial layers. + kVp9DenoiserOnYTwoSpatialLayers }; static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; @@ -154,6 +164,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! 
\n"); } +#if VP8_ROI_MAP +static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { + unsigned int i, j; + memset(roi, 0, sizeof(*roi)); + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + 15) / 16; + roi->cols = (cfg->g_w + 15) / 16; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + roi->delta_q[0] = 0; + roi->delta_q[1] = -63; + roi->delta_q[2] = 0; + roi->delta_q[3] = 0; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. + roi->delta_lf[0] = 0; + roi->delta_lf[1] = 0; + roi->delta_lf[2] = 0; + roi->delta_lf[3] = 0; + + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + roi->static_threshold[0] = 0; + roi->static_threshold[1] = 0; + roi->static_threshold[2] = 0; + roi->static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} +#endif + // Temporal scaling parameters: // NOTE: The 3 prediction frames cannot be used interchangeably due to // differences in the way they are handled throughout the code. The @@ -495,6 +552,7 @@ int main(int argc, char **argv) { vpx_codec_err_t res; unsigned int width; unsigned int height; + uint32_t error_resilient = 0; int speed; int frame_avail; int got_data; @@ -505,16 +563,15 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) - vpx_svc_layer_id_t layer_id = { 0, 0 }; -#else - vpx_svc_layer_id_t layer_id = { 0 }; +#if VP8_ROI_MAP + vpx_roi_map_t roi; #endif + vpx_svc_layer_id_t layer_id = { 0, 0 }; const VpxInterface *encoder = NULL; FILE *infile = NULL; struct RateControlMetrics rc; int64_t cx_time = 0; - const int min_args_base = 11; + const int min_args_base = 13; #if CONFIG_VP9_HIGHBITDEPTH vpx_bit_depth_t bit_depth = VPX_BITS_8; int input_bit_depth = 8; @@ -531,12 +588,14 @@ int main(int argc, char **argv) { if (argc < min_args) { #if CONFIG_VP9_HIGHBITDEPTH die("Usage: %s " - " " + " " + " " " ... \n", argv[0]); #else die("Usage: %s " - " " + " " + " " " ... 
\n", argv[0]); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -553,9 +612,9 @@ int main(int argc, char **argv) { die("Invalid resolution: %d x %d", width, height); } - layering_mode = (int)strtol(argv[10], NULL, 0); + layering_mode = (int)strtol(argv[12], NULL, 0); if (layering_mode < 0 || layering_mode > 13) { - die("Invalid layering mode (0..12) %s", argv[10]); + die("Invalid layering mode (0..12) %s", argv[12]); } if (argc != min_args + mode_to_num_layers[layering_mode]) { @@ -619,11 +678,11 @@ int main(int argc, char **argv) { for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { - rc.layer_target_bitrate[i - 11] = (int)strtol(argv[i], NULL, 0); + rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0); if (strncmp(encoder->name, "vp8", 3) == 0) - cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; + cfg.ts_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; else if (strncmp(encoder->name, "vp9", 3) == 0) - cfg.layer_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; + cfg.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; } // Real time parameters. @@ -634,7 +693,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52; cfg.rc_undershoot_pct = 50; cfg.rc_overshoot_pct = 50; - cfg.rc_buf_initial_sz = 500; + cfg.rc_buf_initial_sz = 600; cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; @@ -642,10 +701,14 @@ int main(int argc, char **argv) { cfg.rc_resize_allowed = 0; // Use 1 thread as default. - cfg.g_threads = 1; + cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0); + error_resilient = (uint32_t)strtoul(argv[10], NULL, 0); + if (error_resilient != 0 && error_resilient != 1) { + die("Invalid value for error resilient (0, 1): %d.", error_resilient); + } // Enable error resilient mode. - cfg.g_error_resilient = 1; + cfg.g_error_resilient = error_resilient; cfg.g_lag_in_frames = 0; cfg.kf_mode = VPX_KF_AUTO; @@ -700,18 +763,33 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if VP8_ROI_MAP + vp8_set_roi_map(&cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif + } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); + vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); - vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); + // TODO(marpan/jianj): There is an issue with row-mt for low resolutons at + // high speed settings, disable its use for those cases for now. 
+ if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1 : 0)) die_codec(&codec, "Failed to set SVC"); for (i = 0; i < cfg.ts_number_layers; ++i) { @@ -730,7 +808,7 @@ int main(int argc, char **argv) { // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. { - const int max_intra_size_pct = 900; + const int max_intra_size_pct = 1000; vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); } @@ -740,10 +818,8 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) // Update the temporal layer_id. No spatial layers in this test. layer_id.spatial_layer_id = 0; -#endif layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; if (strncmp(encoder->name, "vp9", 3) == 0) { diff --git a/libs/libvpx/libs.mk b/libs/libvpx/libs.mk index 97ed93bb70..a3e2f9d0eb 100644 --- a/libs/libvpx/libs.mk +++ b/libs/libvpx/libs.mk @@ -12,7 +12,7 @@ # ARM assembly files are written in RVCT-style. We use some make magic to # filter those files to allow GCC compilation ifeq ($(ARCH_ARM),yes) - ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm) + ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm) else ASM:=.asm endif @@ -106,9 +106,6 @@ ifeq ($(CONFIG_VP9_DECODER),yes) CODEC_DOC_SECTIONS += vp9 vp9_decoder endif -VP9_PREFIX=vp9/ -$(BUILD_PFX)$(VP9_PREFIX)%.c.o: - ifeq ($(CONFIG_ENCODERS),yes) CODEC_DOC_SECTIONS += encoder endif @@ -116,6 +113,12 @@ ifeq ($(CONFIG_DECODERS),yes) CODEC_DOC_SECTIONS += decoder endif +# Suppress -Wextra warnings in third party code. +$(BUILD_PFX)third_party/googletest/%.cc.o: CXXFLAGS += -Wno-missing-field-initializers +# Suppress -Wextra warnings in first party code pending investigation. +# https://bugs.chromium.org/p/webm/issues/detail?id=1069 +$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered +$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) @@ -146,6 +149,7 @@ CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm endif CODEC_EXPORTS-yes += vpx/exports_com CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc @@ -184,6 +188,13 @@ libvpx_srcs.txt: @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_srcs.txt +# Assembly files that are included, but don't define symbols themselves. +# Filtered out to avoid Windows build warnings. +ASM_INCLUDES := \ + third_party/x86inc/x86inc.asm \ + vpx_config.asm \ + vpx_ports/x86_abi_support.asm \ + vpx_dsp/x86/bitdepth_conversion_sse2.asm \ ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) @@ -195,13 +206,6 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def -# Assembly files that are included, but don't define symbols themselves. -# Filtered out to avoid Visual Studio build warnings. 
-ASM_INCLUDES := \ - third_party/x86inc/x86inc.asm \ - vpx_config.asm \ - vpx_ports/x86_abi_support.asm \ - vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ @@ -224,12 +228,12 @@ vpx.$(VCPROJ_SFX): $(RTCD) endif else -LIBVPX_OBJS=$(call objs,$(CODEC_SRCS)) +LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 +SO_VERSION_MAJOR := 5 SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) @@ -363,7 +367,7 @@ endif # # Add assembler dependencies for configuration. # -$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %.S.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm @@ -388,7 +392,7 @@ LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) -libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) @@ -401,8 +405,16 @@ CLEAN-OBJS += libvpx_test_srcs.txt $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" - $(qexec)trap 'rm -f $@' INT TERM &&\ - curl -L -o $@ $(call libvpx_test_data_url,$(@F)) + # Attempt to download the file using curl, retrying once if it fails for a + # partial file (18). + $(qexec)( \ + trap 'rm -f $@' INT TERM; \ + curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; \ + case "$$?" in \ + 18) $$curl -C -;; \ + esac \ + ) testdata:: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ diff --git a/libs/libvpx/rate_hist.c b/libs/libvpx/rate_hist.c index 872a10bae0..6cf8ce7bb0 100644 --- a/libs/libvpx/rate_hist.c +++ b/libs/libvpx/rate_hist.c @@ -37,7 +37,13 @@ struct rate_hist { struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, const vpx_rational_t *fps) { int i; - struct rate_hist *hist = malloc(sizeof(*hist)); + struct rate_hist *hist = calloc(1, sizeof(*hist)); + + if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 || + fps->den == 0) { + destroy_rate_histogram(hist); + return NULL; + } // Determine the number of samples in the buffer. 
Use the file's framerate // to determine the number of frames in rc_buf_sz milliseconds, with an @@ -80,7 +86,11 @@ void update_rate_histogram(struct rate_hist *hist, (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den; - int idx = hist->frames++ % hist->samples; + int idx; + + if (hist == NULL || cfg == NULL || pkt == NULL) return; + + idx = hist->frames++ % hist->samples; hist->pts[idx] = now; hist->sz[idx] = (int)pkt->data.frame.sz; @@ -116,9 +126,14 @@ void update_rate_histogram(struct rate_hist *hist, static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, int *num_buckets) { int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; - int buckets = *num_buckets; + int buckets; int i; + assert(bucket != NULL); + assert(num_buckets != NULL); + + buckets = *num_buckets; + /* Find the extrema for this list of buckets */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { @@ -181,6 +196,8 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, const char *pat1, *pat2; int i; + assert(bucket != NULL); + switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: case 2: @@ -259,6 +276,8 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, int i, scale; int buckets = 0; + if (hist == NULL || cfg == NULL) return; + for (i = 0; i < RATE_BINS; i++) { if (hist->bucket[i].low == INT_MAX) continue; hist->bucket[buckets++] = hist->bucket[i]; diff --git a/libs/libvpx/test/acm_random.h b/libs/libvpx/test/acm_random.h index c2f6b0e410..d915cf9133 100644 --- a/libs/libvpx/test/acm_random.h +++ b/libs/libvpx/test/acm_random.h @@ -11,6 +11,10 @@ #ifndef TEST_ACM_RANDOM_H_ #define TEST_ACM_RANDOM_H_ +#include + +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" @@ -50,6 +54,13 @@ class ACMRandom { return r < 128 ? r << 4 : r >> 4; } + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. 
+ assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + int PseudoUniform(int range) { return random_.Generate(range); } int operator()(int n) { return PseudoUniform(n); } diff --git a/libs/libvpx/test/android/Android.mk b/libs/libvpx/test/android/Android.mk index 48872a2b65..7318de2fc4 100644 --- a/libs/libvpx/test/android/Android.mk +++ b/libs/libvpx/test/android/Android.mk @@ -32,6 +32,7 @@ LOCAL_CPP_EXTENSION := .cc LOCAL_MODULE := gtest LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc include $(BUILD_STATIC_LIBRARY) diff --git a/libs/libvpx/test/avg_test.cc b/libs/libvpx/test/avg_test.cc index 867a77aa0d..ad21198e4b 100644 --- a/libs/libvpx/test/avg_test.cc +++ b/libs/libvpx/test/avg_test.cc @@ -14,6 +14,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -22,6 +23,7 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -53,7 +55,7 @@ class AverageTestBase : public ::testing::Test { } // Sum Pixels - unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 8; ++h) { for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; @@ -61,7 +63,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 32) >> 6); } - unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 4; ++h) { for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; @@ -98,11 +100,12 @@ class AverageTest : public AverageTestBase, protected: void CheckAverages() { + const int block_size = GET_PARAM(3); unsigned int expected = 0; - if (GET_PARAM(3) == 8) { + if (block_size == 8) { expected = ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); - } else if (GET_PARAM(3) == 4) { + } else if (block_size == 4) { expected = ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); } @@ -185,7 +188,7 @@ class IntProColTest : public AverageTestBase, int16_t sum_c_; }; -typedef int (*SatdFunc)(const int16_t *coeffs, int length); +typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); typedef std::tr1::tuple SatdTestParam; class SatdTest : public ::testing::Test, @@ -195,7 +198,7 @@ class SatdTest : public ::testing::Test, satd_size_ = GET_PARAM(0); satd_func_ = GET_PARAM(1); rnd_.Reset(ACMRandom::DeterministicSeed()); - src_ = reinterpret_cast( + src_ = reinterpret_cast( vpx_memalign(16, sizeof(*src_) * satd_size_)); ASSERT_TRUE(src_ != NULL); } @@ -205,12 +208,15 @@ class SatdTest : public ::testing::Test, vpx_free(src_); } - void FillConstant(const int16_t val) { + void FillConstant(const tran_low_t val) { for (int i = 0; i < satd_size_; ++i) src_[i] = val; } void FillRandom() { - for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16(); + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16(); + src_[i] = (tran_low_t)tmp; + } } void Check(const int expected) { @@ -222,11 +228,66 @@ class SatdTest : public ::testing::Test, int 
satd_size_; private: - int16_t *src_; + tran_low_t *src_; SatdFunc satd_func_; ACMRandom rnd_; }; +typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size); +typedef std::tr1::tuple BlockErrorTestFPParam; + +class BlockErrorTestFP + : public ::testing::Test, + public ::testing::WithParamInterface { + protected: + virtual void SetUp() { + txfm_size_ = GET_PARAM(0); + block_error_func_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + coeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*coeff_) * txfm_size_)); + dqcoeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_)); + ASSERT_TRUE(coeff_ != NULL); + ASSERT_TRUE(dqcoeff_ != NULL); + } + + virtual void TearDown() { + libvpx_test::ClearSystemState(); + vpx_free(coeff_); + vpx_free(dqcoeff_); + } + + void FillConstant(const tran_low_t coeff_val, const tran_low_t dqcoeff_val) { + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = coeff_val; + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = dqcoeff_val; + } + + void FillRandom() { + // Just two fixed seeds + rnd_.Reset(0xb0b9); + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = rnd_.Rand16() >> 1; + rnd_.Reset(0xb0c8); + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = rnd_.Rand16() >> 1; + } + + void Check(const int64_t expected) { + int64_t total; + ASM_REGISTER_STATE_CHECK( + total = block_error_func_(coeff_, dqcoeff_, txfm_size_)); + EXPECT_EQ(expected, total); + } + + int txfm_size_; + + private: + tran_low_t *coeff_; + tran_low_t *dqcoeff_; + BlockErrorFunc block_error_func_; + ACMRandom rnd_; +}; + uint8_t *AverageTestBase::source_data_ = NULL; TEST_P(AverageTest, MinValue) { @@ -307,6 +368,66 @@ TEST_P(SatdTest, Random) { Check(expected); } +TEST_P(SatdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +TEST_P(BlockErrorTestFP, MinValue) { + const int64_t kMin = -32640; + const int64_t expected = kMin * kMin * txfm_size_; + FillConstant(kMin, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, MaxValue) { + const int64_t kMax = 32640; + const int64_t expected = kMax * kMax * txfm_size_; + FillConstant(kMax, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, Random) { + int64_t expected; + switch (txfm_size_) { + case 16: expected = 2051681432; break; + case 64: expected = 11075114379; break; + case 256: expected = 44386271116; break; + case 1024: expected = 184774996089; break; + default: + FAIL() << "Invalid satd size (" << txfm_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(BlockErrorTestFP, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, dqcoeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + using 
std::tr1::make_tuple; INSTANTIATE_TEST_CASE_P( @@ -320,6 +441,13 @@ INSTANTIATE_TEST_CASE_P(C, SatdTest, make_tuple(256, &vpx_satd_c), make_tuple(1024, &vpx_satd_c))); +INSTANTIATE_TEST_CASE_P( + C, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_c), + make_tuple(64, &vp9_block_error_fp_c), + make_tuple(256, &vp9_block_error_fp_c), + make_tuple(1024, &vp9_block_error_fp_c))); + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, AverageTest, @@ -349,6 +477,28 @@ INSTANTIATE_TEST_CASE_P(SSE2, SatdTest, make_tuple(64, &vpx_satd_sse2), make_tuple(256, &vpx_satd_sse2), make_tuple(1024, &vpx_satd_sse2))); + +INSTANTIATE_TEST_CASE_P( + SSE2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2), + make_tuple(64, &vp9_block_error_fp_sse2), + make_tuple(256, &vp9_block_error_fp_sse2), + make_tuple(1024, &vp9_block_error_fp_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + +INSTANTIATE_TEST_CASE_P( + AVX2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), + make_tuple(64, &vp9_block_error_fp_avx2), + make_tuple(256, &vp9_block_error_fp_avx2), + make_tuple(1024, &vp9_block_error_fp_avx2))); #endif #if HAVE_NEON @@ -380,7 +530,18 @@ INSTANTIATE_TEST_CASE_P(NEON, SatdTest, make_tuple(64, &vpx_satd_neon), make_tuple(256, &vpx_satd_neon), make_tuple(1024, &vpx_satd_neon))); -#endif + +// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are +// in place. +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), + make_tuple(64, &vp9_block_error_fp_neon), + make_tuple(256, &vp9_block_error_fp_neon), + make_tuple(1024, &vp9_block_error_fp_neon))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON #if HAVE_MSA INSTANTIATE_TEST_CASE_P( @@ -391,6 +552,30 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa), make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa))); -#endif + +INSTANTIATE_TEST_CASE_P( + MSA, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_msa, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_CASE_P( + MSA, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_msa, + &vpx_int_pro_col_c))); + +// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are +// in place. +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(MSA, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_msa), + make_tuple(64, &vpx_satd_msa), + make_tuple(256, &vpx_satd_msa), + make_tuple(1024, &vpx_satd_msa))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/buffer.h b/libs/libvpx/test/buffer.h new file mode 100644 index 0000000000..2175dad9d9 --- /dev/null +++ b/libs/libvpx/test/buffer.h @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_BUFFER_H_ +#define TEST_BUFFER_H_ + +#include + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +namespace libvpx_test { + +template +class Buffer { + public: + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(alignment), + padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), + raw_buffer_(NULL) {} + + Buffer(int width, int height, int padding) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), + raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + Buffer(int width, int height, int padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(alignment), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + ~Buffer() { + if (alignment_) { + vpx_free(raw_buffer_); + } else { + delete[] raw_buffer_; + } + } + + T *TopLeftPixel() const; + + int stride() const { return stride_; } + + // Set the buffer (excluding padding) to 'value'. + void Set(const T value); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'rand_func'. + void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'RandRange' with range 'low' to 'high' which typically must be within + // testing::internal::Random::kMaxRange (1u << 31). However, because we want + // to allow negative low (and high) values, it is restricted to INT32_MAX + // here. + void Set(ACMRandom *rand_class, const T low, const T high); + + // Copy the contents of Buffer 'a' (excluding padding). + void CopyFrom(const Buffer &a); + + void DumpBuffer() const; + + // Highlight the differences between two buffers if they are the same size. + void PrintDifference(const Buffer &a) const; + + bool HasPadding() const; + + // Sets all the values in the buffer to 'padding_value'. + void SetPadding(const T padding_value); + + // Checks if all the values (excluding padding) are equal to 'value' if the + // Buffers are the same size. + bool CheckValues(const T value) const; + + // Check that padding matches the expected value or there is no padding. + bool CheckPadding() const; + + // Compare the non-padding portion of two buffers if they are the same size. 
+ bool CheckValues(const Buffer &a) const; + + bool Init() { + if (raw_buffer_ != NULL) return false; + EXPECT_GT(width_, 0); + EXPECT_GT(height_, 0); + EXPECT_GE(top_padding_, 0); + EXPECT_GE(left_padding_, 0); + EXPECT_GE(right_padding_, 0); + EXPECT_GE(bottom_padding_, 0); + stride_ = left_padding_ + width_ + right_padding_; + num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_); + raw_size_ = num_elements_ * sizeof(T); + if (alignment_) { + EXPECT_GE(alignment_, sizeof(T)); + // Ensure alignment of the first value will be preserved. + EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u); + // Ensure alignment of the subsequent rows will be preserved when there is + // a stride. + if (stride_ != width_) { + EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u); + } + raw_buffer_ = reinterpret_cast(vpx_memalign(alignment_, raw_size_)); + } else { + raw_buffer_ = new (std::nothrow) T[num_elements_]; + } + EXPECT_TRUE(raw_buffer_ != NULL); + SetPadding(std::numeric_limits::max()); + return !::testing::Test::HasFailure(); + } + + private: + bool BufferSizesMatch(const Buffer &a) const; + + const int width_; + const int height_; + const int top_padding_; + const int left_padding_; + const int right_padding_; + const int bottom_padding_; + const unsigned int alignment_; + T padding_value_; + int stride_; + int raw_size_; + int num_elements_; + T *raw_buffer_; +}; + +template +T *Buffer::TopLeftPixel() const { + if (!raw_buffer_) return NULL; + return raw_buffer_ + (top_padding_ * stride_) + left_padding_; +} + +template +void Buffer::Set(const T value) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = value; + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = (*rand_class.*rand_func)(); + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, const T low, const T high) { + if (!raw_buffer_) return; + + EXPECT_LE(low, high); + EXPECT_LE(static_cast(high) - low, + std::numeric_limits::max()); + + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + // 'low' will be promoted to unsigned given the return type of RandRange. + // Store the value as an int to avoid unsigned overflow warnings when + // 'low' is negative. 
+ const int32_t value = + static_cast((*rand_class).RandRange(high - low)); + src[width] = static_cast(value + low); + } + src += stride_; + } +} + +template +void Buffer::CopyFrom(const Buffer &a) { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + b_src[width] = a_src[width]; + } + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::DumpBuffer() const { + if (!raw_buffer_) return; + for (int height = 0; height < height_ + top_padding_ + bottom_padding_; + ++height) { + for (int width = 0; width < stride_; ++width) { + printf("%4d", raw_buffer_[height + width * stride_]); + } + printf("\n"); + } +} + +template +bool Buffer::HasPadding() const { + if (!raw_buffer_) return false; + return top_padding_ || left_padding_ || right_padding_ || bottom_padding_; +} + +template +void Buffer::PrintDifference(const Buffer &a) const { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = TopLeftPixel(); + + printf("This buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", b_src[width]); + } else { + printf("%4d", b_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } + + a_src = a.TopLeftPixel(); + b_src = TopLeftPixel(); + + printf("Reference buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", a_src[width]); + } else { + printf("%4d", a_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::SetPadding(const T padding_value) { + if (!raw_buffer_) return; + padding_value_ = padding_value; + + T *src = raw_buffer_; + for (int i = 0; i < num_elements_; ++i) { + src[i] = padding_value; + } +} + +template +bool Buffer::CheckValues(const T value) const { + if (!raw_buffer_) return false; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (value != src[width]) { + return false; + } + } + src += stride_; + } + return true; +} + +template +bool Buffer::CheckPadding() const { + if (!raw_buffer_) return false; + if (!HasPadding()) return true; + + // Top padding. + T const *top = raw_buffer_; + for (int i = 0; i < stride_ * top_padding_; ++i) { + if (padding_value_ != top[i]) { + return false; + } + } + + // Left padding. + T const *left = TopLeftPixel() - left_padding_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < left_padding_; ++width) { + if (padding_value_ != left[width]) { + return false; + } + } + left += stride_; + } + + // Right padding. 
+ T const *right = TopLeftPixel() + width_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < right_padding_; ++width) { + if (padding_value_ != right[width]) { + return false; + } + } + right += stride_; + } + + // Bottom padding + T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_; + for (int i = 0; i < stride_ * bottom_padding_; ++i) { + if (padding_value_ != bottom[i]) { + return false; + } + } + + return true; +} + +template +bool Buffer::CheckValues(const Buffer &a) const { + if (!raw_buffer_) return false; + if (!BufferSizesMatch(a)) return false; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + return false; + } + } + a_src += a.stride(); + b_src += this->stride(); + } + return true; +} + +template +bool Buffer::BufferSizesMatch(const Buffer &a) const { + if (!raw_buffer_) return false; + if (a.width_ != this->width_ || a.height_ != this->height_) { + printf( + "Reference buffer of size %dx%d does not match this buffer which is " + "size %dx%d\n", + a.width_, a.height_, this->width_, this->height_); + return false; + } + + return true; +} +} // namespace libvpx_test +#endif // TEST_BUFFER_H_ diff --git a/libs/libvpx/test/byte_alignment_test.cc b/libs/libvpx/test/byte_alignment_test.cc index d78294d10f..5a058b2756 100644 --- a/libs/libvpx/test/byte_alignment_test.cc +++ b/libs/libvpx/test/byte_alignment_test.cc @@ -128,8 +128,8 @@ class ByteAlignmentTest // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. 
Filename: " << md5_file_name_; } void CheckMd5(const vpx_image_t &img) { diff --git a/libs/libvpx/test/codec_factory.h b/libs/libvpx/test/codec_factory.h index e867dacafe..d5882ed9c8 100644 --- a/libs/libvpx/test/codec_factory.h +++ b/libs/libvpx/test/codec_factory.h @@ -65,6 +65,12 @@ class CodecTestWith3Params : public ::testing::TestWithParam< std::tr1::tuple > {}; +template +class CodecTestWith4Params + : public ::testing::TestWithParam< + std::tr1::tuple > { +}; + /* * VP8 Codec Definitions */ @@ -115,6 +121,8 @@ class VP8CodecFactory : public CodecFactory { #if CONFIG_VP8_DECODER return new VP8Decoder(cfg, flags); #else + (void)cfg; + (void)flags; return NULL; #endif } @@ -126,6 +134,10 @@ class VP8CodecFactory : public CodecFactory { #if CONFIG_VP8_ENCODER return new VP8Encoder(cfg, deadline, init_flags, stats); #else + (void)cfg; + (void)deadline; + (void)init_flags; + (void)stats; return NULL; #endif } @@ -135,6 +147,8 @@ class VP8CodecFactory : public CodecFactory { #if CONFIG_VP8_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage); #else + (void)cfg; + (void)usage; return VPX_CODEC_INCAPABLE; #endif } @@ -203,6 +217,8 @@ class VP9CodecFactory : public CodecFactory { #if CONFIG_VP9_DECODER return new VP9Decoder(cfg, flags); #else + (void)cfg; + (void)flags; return NULL; #endif } @@ -214,6 +230,10 @@ class VP9CodecFactory : public CodecFactory { #if CONFIG_VP9_ENCODER return new VP9Encoder(cfg, deadline, init_flags, stats); #else + (void)cfg; + (void)deadline; + (void)init_flags; + (void)stats; return NULL; #endif } @@ -223,6 +243,8 @@ class VP9CodecFactory : public CodecFactory { #if CONFIG_VP9_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage); #else + (void)cfg; + (void)usage; return VPX_CODEC_INCAPABLE; #endif } diff --git a/libs/libvpx/test/comp_avg_pred_test.cc b/libs/libvpx/test/comp_avg_pred_test.cc new file mode 100644 index 0000000000..110e065836 --- /dev/null +++ b/libs/libvpx/test/comp_avg_pred_test.cc @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/register_state_check.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +using ::libvpx_test::ACMRandom; +using ::libvpx_test::Buffer; + +typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride); + +uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } + +void reference_pred(const Buffer &pred, const Buffer &ref, + int width, int height, Buffer *avg) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + avg->TopLeftPixel()[y * avg->stride() + x] = + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); + } + } +} + +class AvgPredTest : public ::testing::TestWithParam { + public: + virtual void SetUp() { + avg_pred_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + AvgPredFunc avg_pred_func_; + ACMRandom rnd_; +}; + +TEST_P(AvgPredTest, SizeCombinations) { + // This is called as part of the sub pixel variance. As such it must be one of + // the variance block sizes. + + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + // The sse2 special-cases when ref width == stride, so make sure to test + // it. + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + // Only the reference buffer may have a stride not equal to width. + Buffer ref = + Buffer(width, height, ref_padding ? 
8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); + + ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK( + avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, + height, ref.TopLeftPixel(), ref.stride())); + + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } + } + } +} + +TEST_P(AvgPredTest, CompareReferenceRandom) { + const int width = 64; + const int height = 32; + Buffer ref = Buffer(width, height, 8); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); + + for (int i = 0; i < 500; ++i) { + ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(), + pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride())); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } +} + +TEST_P(AvgPredTest, DISABLED_Speed) { + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + Buffer ref = + Buffer(width, height, ref_padding ? 
8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer avg = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg.Init()); + + ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 10000000 / (width * height); ++i) { + avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride()); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n", + ref_padding, width, height, elapsed_time); + } + } + } +} + +INSTANTIATE_TEST_CASE_P(C, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_neon)); +#endif // HAVE_NEON + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_vsx)); +#endif // HAVE_VSX +} // namespace diff --git a/libs/libvpx/test/convolve_test.cc b/libs/libvpx/test/convolve_test.cc index 432d1b009e..70f0b11a77 100644 --- a/libs/libvpx/test/convolve_test.cc +++ b/libs/libvpx/test/convolve_test.cc @@ -12,8 +12,8 @@ #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" @@ -25,6 +25,7 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { @@ -32,9 +33,15 @@ static const unsigned int kMaxDimension = 64; typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); + +typedef void (*WrapperFilterBlock2d8Func)( + const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd); struct ConvolveFunctions { ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8, @@ -43,25 +50,30 @@ struct ConvolveFunctions { ConvolveFunc sh8_avg, ConvolveFunc sv8, ConvolveFunc sv8_avg, ConvolveFunc shv8, ConvolveFunc shv8_avg, int bd) - : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), - v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8), - sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg), - use_highbd_(bd) {} + : use_highbd_(bd) { + copy_[0] = copy; + copy_[1] = avg; + h8_[0] = h8; + h8_[1] = h8_avg; + v8_[0] = v8; + v8_[1] = v8_avg; + hv8_[0] = hv8; + hv8_[1] = hv8_avg; + sh8_[0] = sh8; + sh8_[1] = sh8_avg; + sv8_[0] = sv8; + sv8_[1] = sv8_avg; + shv8_[0] = shv8; + shv8_[1] = shv8_avg; + } - ConvolveFunc copy_; - ConvolveFunc avg_; - ConvolveFunc h8_; - ConvolveFunc v8_; - ConvolveFunc hv8_; - ConvolveFunc h8_avg_; - ConvolveFunc v8_avg_; - ConvolveFunc hv8_avg_; - ConvolveFunc sh8_; // scaled horiz - ConvolveFunc sv8_; // scaled vert - ConvolveFunc shv8_; // scaled horiz/vert - ConvolveFunc 
sh8_avg_; // scaled avg horiz - ConvolveFunc sv8_avg_; // scaled avg vert - ConvolveFunc shv8_avg_; // scaled avg horiz/vert + ConvolveFunc copy_[2]; + ConvolveFunc h8_[2]; + ConvolveFunc v8_[2]; + ConvolveFunc hv8_[2]; + ConvolveFunc sh8_[2]; // scaled horiz + ConvolveFunc sv8_[2]; // scaled vert + ConvolveFunc shv8_[2]; // scaled horiz/vert int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. }; @@ -82,7 +94,7 @@ typedef std::tr1::tuple ConvolveParam; uint8_t clip_pixel(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; } void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height) { // Between passes, we use an intermediate buffer whose height is extended to @@ -112,10 +124,10 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) + - (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) + - (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) + - (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) + + const int temp = (src_ptr[0] * hfilter[0]) + (src_ptr[1] * hfilter[1]) + + (src_ptr[2] * hfilter[2]) + (src_ptr[3] * hfilter[3]) + + (src_ptr[4] * hfilter[4]) + (src_ptr[5] * hfilter[5]) + + (src_ptr[6] * hfilter[6]) + (src_ptr[7] * hfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... @@ -133,10 +145,10 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) + - (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) + - (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) + - (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) + + const int temp = (src_ptr[0] * vfilter[0]) + (src_ptr[1] * vfilter[1]) + + (src_ptr[2] * vfilter[2]) + (src_ptr[3] * vfilter[3]) + + (src_ptr[4] * vfilter[4]) + (src_ptr[5] * vfilter[5]) + + (src_ptr[6] * vfilter[6]) + (src_ptr[7] * vfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... 
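The ConvolveFunc prototype above now takes the whole InterpKernel bank plus 1/16-pel offsets and steps (x0_q4, x_step_q4, y0_q4, y_step_q4) instead of the old per-direction filter pointer and stride, which is what lets the same function table drive both the regular and the scaled tests. Below is a minimal sketch of how an implementation typically interprets those arguments. It is patterned after the plain C reference in vpx_dsp but is illustrative only; convolve_horiz_sketch is a made-up name, and a step of 16 simply advances one source pixel per output pixel (no scaling).

#include <stddef.h>
#include <stdint.h>

#include "vpx_dsp/vpx_dsp_common.h"  // clip_pixel(), ROUND_POWER_OF_TWO()
#include "vpx_dsp/vpx_filter.h"      // InterpKernel, FILTER_BITS, SUBPEL_*

static void convolve_horiz_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;  // center the 8-tap window on the output pixel
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const s = &src[x_q4 >> SUBPEL_BITS];   // integer position
      const int16_t *const k = filter[x_q4 & SUBPEL_MASK];  // 1/16-pel phase
      int sum = 0;
      for (int t = 0; t < SUBPEL_TAPS; ++t) sum += s[t] * k[t];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;  // step of 16 keeps the sampling unscaled
    }
    src += src_stride;
    dst += dst_stride;
  }
}

With x0_q4 == 0 and x_step_q4 == 16 every output pixel uses kernel phase 0, which in each vp9_filter_kernels bank is the pass-through tap set {0, 0, 0, 128, 0, 0, 0, 0}; that is why the Copy* tests below can call the scaled entry points with vp9_filter_kernels[0] and zero offsets and still expect output identical to the input.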
@@ -162,7 +174,7 @@ void block2d_average_c(uint8_t *src, unsigned int src_stride, void filter_average_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height) { @@ -170,7 +182,7 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr, assert(output_width <= kMaxDimension); assert(output_height <= kMaxDimension); - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, tmp, 64, output_width, output_height); block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, output_height); } @@ -178,7 +190,7 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr, #if CONFIG_VP9_HIGHBITDEPTH void highbd_filter_block2d_8_c(const uint16_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, + const int16_t *hfilter, const int16_t *vfilter, uint16_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height, int bd) { @@ -210,10 +222,10 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) + - (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) + - (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) + - (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) + + const int temp = (src_ptr[0] * hfilter[0]) + (src_ptr[1] * hfilter[1]) + + (src_ptr[2] * hfilter[2]) + (src_ptr[3] * hfilter[3]) + + (src_ptr[4] * hfilter[4]) + (src_ptr[5] * hfilter[5]) + + (src_ptr[6] * hfilter[6]) + (src_ptr[7] * hfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... @@ -234,10 +246,10 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) + - (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) + - (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) + - (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) + + const int temp = (src_ptr[0] * vfilter[0]) + (src_ptr[1] * vfilter[1]) + + (src_ptr[2] * vfilter[2]) + (src_ptr[3] * vfilter[3]) + + (src_ptr[4] * vfilter[4]) + (src_ptr[5] * vfilter[5]) + + (src_ptr[6] * vfilter[6]) + (src_ptr[7] * vfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... 
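The reference paths in the hunks above (filter_block2d_8_c and its high-bit-depth twin) share one rounding scheme: the eight tap products are accumulated together with a rounding term of VP9_FILTER_WEIGHT >> 1, then shifted down by VP9_FILTER_SHIFT and clipped. A quick arithmetic check, assuming those constants are 128 and 7 to match the 7-bit filter precision used elsewhere in libvpx, shows why any kernel whose taps sum to 128 reproduces a flat input exactly:

#include <assert.h>

int main(void) {
  const int kWeight = 128;  // assumed value of VP9_FILTER_WEIGHT
  const int kShift = 7;     // assumed value of VP9_FILTER_SHIFT
  for (int p = 0; p < 256; ++p) {
    // A kernel whose taps sum to kWeight maps a constant block to
    // kWeight * p per pixel; add the rounding term and shift back down.
    const int temp = kWeight * p + (kWeight >> 1);
    assert((temp >> kShift) == p);  // the original pixel value comes back
  }
  return 0;
}

The high-bit-depth variant differs only in the final clip, which goes to [0, (1 << bd) - 1] rather than [0, 255]; the "0-255" wording in its comments is carried over from the 8-bit version.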
@@ -265,20 +277,64 @@ void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride, void highbd_filter_average_block2d_8_c( const uint16_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, uint16_t *dst_ptr, + const int16_t *hfilter, const int16_t *vfilter, uint16_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height, int bd) { uint16_t tmp[kMaxDimension * kMaxDimension]; assert(output_width <= kMaxDimension); assert(output_height <= kMaxDimension); - highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, + highbd_filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, tmp, 64, output_width, output_height, bd); highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, output_height); } #endif // CONFIG_VP9_HIGHBITDEPTH +void wrapper_filter_average_block2d_8_c( + const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd == 0) { + filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); + } else { + highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, + hfilter, vfilter, + CAST_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, use_highbd); + } +#else + ASSERT_EQ(0, use_highbd); + filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); +#endif +} + +void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, + uint8_t *dst_ptr, unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, int use_highbd) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd == 0) { + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); + } else { + highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, + vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, use_highbd); + } +#else + ASSERT_EQ(0, use_highbd); + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, + output_width, output_height); +#endif +} + class ConvolveTest : public ::testing::TestWithParam { public: static void SetUpTestCase() { @@ -404,7 +460,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return input_ + offset; } else { - return CONVERT_TO_BYTEPTR(input16_) + offset; + return CAST_TO_BYTEPTR(input16_ + offset); } #else return input_ + offset; @@ -417,7 +473,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return output_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_) + offset; + return CAST_TO_BYTEPTR(output16_ + offset); } #else return output_ + offset; @@ -430,7 +486,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return output_ref_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_ref_) + offset; + return CAST_TO_BYTEPTR(output16_ref_ + offset); } #else return output_ref_ + offset; @@ -442,7 +498,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return list[index]; } else { - return CONVERT_TO_SHORTPTR(list)[index]; + return CAST_TO_SHORTPTR(list)[index]; } 
#else return list[index]; @@ -454,57 +510,13 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { list[index] = (uint8_t)val; } else { - CONVERT_TO_SHORTPTR(list)[index] = val; + CAST_TO_SHORTPTR(list)[index] = val; } #else list[index] = (uint8_t)val; #endif } - void wrapper_filter_average_block2d_8_c( - const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, uint8_t *dst_ptr, - unsigned int dst_stride, unsigned int output_width, - unsigned int output_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (UUT_->use_highbd_ == 0) { - filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); - } else { - highbd_filter_average_block2d_8_c( - CONVERT_TO_SHORTPTR(src_ptr), src_stride, HFilter, VFilter, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height, - UUT_->use_highbd_); - } -#else - filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); -#endif - } - - void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, uint8_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (UUT_->use_highbd_ == 0) { - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); - } else { - highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, - HFilter, VFilter, CONVERT_TO_SHORTPTR(dst_ptr), - dst_stride, output_width, output_height, - UUT_->use_highbd_); - } -#else - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); -#endif - } - const ConvolveFunctions *UUT_; static uint8_t *input_; static uint8_t *output_; @@ -528,12 +540,167 @@ uint16_t *ConvolveTest::output16_ref_ = NULL; TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); } +TEST_P(ConvolveTest, DISABLED_Copy_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_copy_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Scale_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_scale_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + TEST_P(ConvolveTest, Copy) { uint8_t *const in = input(); uint8_t *const out = output(); - ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -551,8 +718,8 @@ TEST_P(ConvolveTest, Avg) { uint8_t *const out_ref = output_ref(); CopyOutputToRef(); - ASM_REGISTER_STATE_CHECK(UUT_->avg_(in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -569,12 +736,10 @@ TEST_P(ConvolveTest, Avg) { TEST_P(ConvolveTest, CopyHoriz) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; - ASM_REGISTER_STATE_CHECK(UUT_->sh8_(in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -589,12 +754,10 @@ TEST_P(ConvolveTest, CopyHoriz) { TEST_P(ConvolveTest, CopyVert) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; - ASM_REGISTER_STATE_CHECK(UUT_->sv8_(in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -609,12 +772,10 @@ TEST_P(ConvolveTest, CopyVert) { TEST_P(ConvolveTest, Copy2D) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; - ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -650,138 +811,84 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { } } -const int16_t kInvalidFilter[8] = { 0 }; +const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { + wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c +}; TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { - uint8_t *const in = input(); - uint8_t *const out = output(); + for (int i = 0; i < 2; ++i) { + uint8_t *const in = input(); + uint8_t *const out = output(); #if 
CONFIG_VP9_HIGHBITDEPTH - uint8_t ref8[kOutputStride * kMaxDimension]; - uint16_t ref16[kOutputStride * kMaxDimension]; - uint8_t *ref; - if (UUT_->use_highbd_ == 0) { - ref = ref8; - } else { - ref = CONVERT_TO_BYTEPTR(ref16); - } + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_highbd_ == 0) { + ref = ref8; + } else { + ref = CAST_TO_BYTEPTR(ref16); + } #else - uint8_t ref[kOutputStride * kMaxDimension]; + uint8_t ref[kOutputStride * kMaxDimension]; #endif - for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { - const InterpKernel *filters = - vp9_filter_kernels[static_cast(filter_bank)]; - - for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { - for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { - wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x], - filters[filter_y], ref, kOutputStride, - Width(), Height()); - - if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); - else if (filter_y) - ASM_REGISTER_STATE_CHECK( - UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter, - 16, filters[filter_y], 16, Width(), Height())); - else if (filter_x) - ASM_REGISTER_STATE_CHECK( - UUT_->h8_(in, kInputStride, out, kOutputStride, filters[filter_x], - 16, kInvalidFilter, 16, Width(), Height())); - else - ASM_REGISTER_STATE_CHECK( - UUT_->copy_(in, kInputStride, out, kOutputStride, kInvalidFilter, - 0, kInvalidFilter, 0, Width(), Height())); - - CheckGuardBlocks(); - - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) - ASSERT_EQ(lookup(ref, y * kOutputStride + x), - lookup(out, y * kOutputStride + x)) - << "mismatch at (" << x << "," << y << "), " - << "filters (" << filter_bank << "," << filter_x << "," - << filter_y << ")"; + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; } - } - } - } -} - -TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { - uint8_t *const in = input(); - uint8_t *const out = output(); -#if CONFIG_VP9_HIGHBITDEPTH - uint8_t ref8[kOutputStride * kMaxDimension]; - uint16_t ref16[kOutputStride * kMaxDimension]; - uint8_t *ref; - if (UUT_->use_highbd_ == 0) { - ref = ref8; - } else { - ref = CONVERT_TO_BYTEPTR(ref16); - } #else - uint8_t ref[kOutputStride * kMaxDimension]; -#endif - - // Populate ref and out with some random data - ::libvpx_test::ACMRandom prng; - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - uint16_t r; -#if CONFIG_VP9_HIGHBITDEPTH - if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { r = prng.Rand8Extremes(); - } else { - r = prng.Rand16() & mask_; - } -#else - r = prng.Rand8Extremes(); #endif - assign_val(out, y * kOutputStride + x, r); - assign_val(ref, y * kOutputStride + x, r); + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } } - } - for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { - const InterpKernel *filters = - vp9_filter_kernels[static_cast(filter_bank)]; + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpKernel *filters = + 
vp9_filter_kernels[static_cast(filter_bank)]; - for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { - for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { - wrapper_filter_average_block2d_8_c(in, kInputStride, filters[filter_x], - filters[filter_y], ref, - kOutputStride, Width(), Height()); + for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { + for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { + wrapper_filter_block2d_8[i](in, kInputStride, filters[filter_x], + filters[filter_y], ref, kOutputStride, + Width(), Height(), UUT_->use_highbd_); - if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_avg_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); - else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_avg_( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); - else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_avg_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); - else - ASM_REGISTER_STATE_CHECK( - UUT_->avg_(in, kInputStride, out, kOutputStride, kInvalidFilter, - 0, kInvalidFilter, 0, Width(), Height())); + if (filter_x && filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); + else if (filter_x) + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); + else + ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); - CheckGuardBlocks(); + CheckGuardBlocks(); - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) - ASSERT_EQ(lookup(ref, y * kOutputStride + x), - lookup(out, y * kOutputStride + x)) - << "mismatch at (" << x << "," << y << "), " - << "filters (" << filter_bank << "," << filter_x << "," - << filter_y << ")"; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" << filter_bank << "," << filter_x << "," + << filter_y << ")"; + } } } } @@ -798,7 +905,7 @@ TEST_P(ConvolveTest, FilterExtremes) { if (UUT_->use_highbd_ == 0) { ref = ref8; } else { - ref = CONVERT_TO_BYTEPTR(ref16); + ref = CAST_TO_BYTEPTR(ref16); } #else uint8_t ref[kOutputStride * kMaxDimension]; @@ -852,23 +959,23 @@ TEST_P(ConvolveTest, FilterExtremes) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x], filters[filter_y], ref, kOutputStride, - Width(), Height()); + Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, 
kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) @@ -887,33 +994,51 @@ TEST_P(ConvolveTest, FilterExtremes) { /* This test exercises that enough rows and columns are filtered with every possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + TEST_P(ConvolveTest, CheckScalingFiltering) { uint8_t *const in = input(); uint8_t *const out = output(); - const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + uint8_t ref[kOutputStride * kMaxDimension]; - SetConstantInput(127); + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } - for (int frac = 0; frac < 16; ++frac) { - for (int step = 1; step <= 32; ++step) { - /* Test the horizontal and vertical filters in combination. */ - ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride, - eighttap[frac], step, eighttap[frac], - step, Width(), Height())); + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. 
*/ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); - CheckGuardBlocks(); + CheckGuardBlocks(); - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(lookup(in, y * kInputStride + x), - lookup(out, y * kOutputStride + x)) - << "x == " << x << ", y == " << y << ", frac == " << frac - << ", step == " << step; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } } } } } } +#endif using std::tr1::make_tuple; @@ -921,11 +1046,13 @@ using std::tr1::make_tuple; #define WRAP(func, bd) \ void wrap_##func##_##bd( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast(src), src_stride, \ + reinterpret_cast(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } + #if HAVE_SSE2 && ARCH_X86_64 WRAP(convolve_copy_sse2, 8) WRAP(convolve_avg_sse2, 8) @@ -953,6 +1080,62 @@ WRAP(convolve8_sse2, 12) WRAP(convolve8_avg_sse2, 12) #endif // HAVE_SSE2 && ARCH_X86_64 +#if HAVE_AVX2 +WRAP(convolve_copy_avx2, 8) +WRAP(convolve_avg_avx2, 8) +WRAP(convolve8_horiz_avx2, 8) +WRAP(convolve8_avg_horiz_avx2, 8) +WRAP(convolve8_vert_avx2, 8) +WRAP(convolve8_avg_vert_avx2, 8) +WRAP(convolve8_avx2, 8) +WRAP(convolve8_avg_avx2, 8) + +WRAP(convolve_copy_avx2, 10) +WRAP(convolve_avg_avx2, 10) +WRAP(convolve8_avx2, 10) +WRAP(convolve8_horiz_avx2, 10) +WRAP(convolve8_vert_avx2, 10) +WRAP(convolve8_avg_avx2, 10) +WRAP(convolve8_avg_horiz_avx2, 10) +WRAP(convolve8_avg_vert_avx2, 10) + +WRAP(convolve_copy_avx2, 12) +WRAP(convolve_avg_avx2, 12) +WRAP(convolve8_avx2, 12) +WRAP(convolve8_horiz_avx2, 12) +WRAP(convolve8_vert_avx2, 12) +WRAP(convolve8_avg_avx2, 12) +WRAP(convolve8_avg_horiz_avx2, 12) +WRAP(convolve8_avg_vert_avx2, 12) +#endif // HAVE_AVX2 + +#if HAVE_NEON +WRAP(convolve_copy_neon, 8) +WRAP(convolve_avg_neon, 8) +WRAP(convolve_copy_neon, 10) +WRAP(convolve_avg_neon, 10) +WRAP(convolve_copy_neon, 12) +WRAP(convolve_avg_neon, 12) +WRAP(convolve8_horiz_neon, 8) +WRAP(convolve8_avg_horiz_neon, 8) +WRAP(convolve8_vert_neon, 8) +WRAP(convolve8_avg_vert_neon, 8) +WRAP(convolve8_neon, 8) +WRAP(convolve8_avg_neon, 8) +WRAP(convolve8_horiz_neon, 10) +WRAP(convolve8_avg_horiz_neon, 10) +WRAP(convolve8_vert_neon, 10) +WRAP(convolve8_avg_vert_neon, 10) +WRAP(convolve8_neon, 10) +WRAP(convolve8_avg_neon, 10) +WRAP(convolve8_horiz_neon, 12) +WRAP(convolve8_avg_horiz_neon, 12) +WRAP(convolve8_vert_neon, 12) +WRAP(convolve8_avg_vert_neon, 12) +WRAP(convolve8_neon, 12) +WRAP(convolve8_avg_neon, 12) +#endif // HAVE_NEON + WRAP(convolve_copy_c, 8) WRAP(convolve_avg_c, 8) WRAP(convolve8_horiz_c, 8) @@ -1071,39 +1254,90 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_ssse3)); #endif -#if HAVE_AVX2 && HAVE_SSSE3 +#if 
HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_avx2( + wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8, + wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8, + wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8, + wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_avx2( + wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10, + wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10, + wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10, + wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10, + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, 10); +const ConvolveFunctions convolve12_avx2( + wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12, + wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12, + wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12, + wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12, + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, 12); +const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2), + ALL_SIZES(convolve10_avx2), + ALL_SIZES(convolve12_avx2) }; +INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); +#else // !CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, - vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2, - vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, + vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); - const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_avx2)); -#endif // HAVE_AVX2 && HAVE_SSSE3 +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_AVX2 #if HAVE_NEON -#if HAVE_NEON_ASM +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_neon( + wrap_convolve_copy_neon_8, wrap_convolve_avg_neon_8, + wrap_convolve8_horiz_neon_8, wrap_convolve8_avg_horiz_neon_8, + wrap_convolve8_vert_neon_8, wrap_convolve8_avg_vert_neon_8, + wrap_convolve8_neon_8, wrap_convolve8_avg_neon_8, + wrap_convolve8_horiz_neon_8, wrap_convolve8_avg_horiz_neon_8, + wrap_convolve8_vert_neon_8, wrap_convolve8_avg_vert_neon_8, + wrap_convolve8_neon_8, wrap_convolve8_avg_neon_8, 8); +const ConvolveFunctions convolve10_neon( + wrap_convolve_copy_neon_10, wrap_convolve_avg_neon_10, + wrap_convolve8_horiz_neon_10, wrap_convolve8_avg_horiz_neon_10, + wrap_convolve8_vert_neon_10, wrap_convolve8_avg_vert_neon_10, + wrap_convolve8_neon_10, wrap_convolve8_avg_neon_10, + wrap_convolve8_horiz_neon_10, wrap_convolve8_avg_horiz_neon_10, + wrap_convolve8_vert_neon_10, wrap_convolve8_avg_vert_neon_10, + wrap_convolve8_neon_10, wrap_convolve8_avg_neon_10, 10); +const ConvolveFunctions convolve12_neon( + wrap_convolve_copy_neon_12, wrap_convolve_avg_neon_12, + wrap_convolve8_horiz_neon_12, 
wrap_convolve8_avg_horiz_neon_12, + wrap_convolve8_vert_neon_12, wrap_convolve8_avg_vert_neon_12, + wrap_convolve8_neon_12, wrap_convolve8_avg_neon_12, + wrap_convolve8_horiz_neon_12, wrap_convolve8_avg_horiz_neon_12, + wrap_convolve8_vert_neon_12, wrap_convolve8_avg_vert_neon_12, + wrap_convolve8_neon_12, wrap_convolve8_avg_neon_12, 12); +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon), + ALL_SIZES(convolve10_neon), + ALL_SIZES(convolve12_neon) }; +#else const ConvolveFunctions convolve8_neon( vpx_convolve_copy_neon, vpx_convolve_avg_neon, vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); -#else // HAVE_NEON -const ConvolveFunctions convolve8_neon( - vpx_convolve_copy_neon, vpx_convolve_avg_neon, vpx_convolve8_horiz_neon, - vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, - vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, - vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); -#endif // HAVE_NEON_ASM + vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0); -const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) }; +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; +#endif // CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_neon)); + ::testing::ValuesIn(kArrayConvolve_neon)); #endif // HAVE_NEON #if HAVE_DSPR2 @@ -1125,10 +1359,22 @@ const ConvolveFunctions convolve8_msa( vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_msa)); #endif // HAVE_MSA + +#if HAVE_VSX +const ConvolveFunctions convolve8_vsx( + vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx, + vpx_convolve8_avg_horiz_vsx, vpx_convolve8_vert_vsx, + vpx_convolve8_avg_vert_vsx, vpx_convolve8_vsx, vpx_convolve8_avg_vsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; +INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_vsx)); +#endif // HAVE_VSX } // namespace diff --git a/libs/libvpx/test/datarate_test.cc b/libs/libvpx/test/datarate_test.cc index 67c78bc91c..31a8523d21 100644 --- a/libs/libvpx/test/datarate_test.cc +++ b/libs/libvpx/test/datarate_test.cc @@ -20,7 +20,7 @@ namespace { class DatarateTestLarge : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam { + public ::libvpx_test::CodecTestWith2Params { public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} @@ -30,6 +30,7 @@ class DatarateTestLarge virtual void SetUp() { InitializeConfig(); SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); ResetModel(); } @@ -42,14 +43,24 @@ class DatarateTestLarge duration_ = 0.0; denoiser_offon_test_ = 0; 
denoiser_offon_period_ = -1; + gf_boost_ = 0; + use_roi_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } +#if CONFIG_VP8_ENCODER + if (use_roi_ == 1) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } +#endif + if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) << "denoiser_offon_period_ is not positive."; @@ -87,8 +98,8 @@ class DatarateTestLarge const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; } const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -139,6 +150,10 @@ class DatarateTestLarge int denoiser_on_; int denoiser_offon_test_; int denoiser_offon_period_; + int set_cpu_used_; + int gf_boost_; + int use_roi_; + vpx_roi_map_t roi_; }; #if CONFIG_TEMPORAL_DENOISING @@ -156,9 +171,6 @@ TEST_P(DatarateTestLarge, DenoiserLevels) { // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, // denoiserOnAggressive, and denoiserOnAdaptive. - // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j - // refers to the blur thresholds: 20, 40, 60 80. - // The j = 0 case (denoiser off) is covered in the tests below. denoiser_on_ = j; cfg_.rc_target_bitrate = 300; ResetModel(); @@ -166,7 +178,7 @@ TEST_P(DatarateTestLarge, DenoiserLevels) { ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } } @@ -190,7 +202,7 @@ TEST_P(DatarateTestLarge, DenoiserOffOn) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } #endif // CONFIG_TEMPORAL_DENOISING @@ -221,8 +233,7 @@ TEST_P(DatarateTestLarge, BasicBufferModel) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } } @@ -256,15 +267,142 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { } } -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif -#ifndef BUILDING_WITH_TSAN TEST_P(DatarateTestLarge, DropFramesMultiThreads) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + cfg_.rc_target_bitrate = 200; + ResetModel(); + 
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +class DatarateTestRealTime : public DatarateTestLarge { + public: + virtual ~DatarateTestRealTime() {} +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestRealTime, DenoiserLevels) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int j = 1; j < 5; ++j) { + // Run over the denoiser levels. + // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j + // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, + // denoiserOnAggressive, and denoiserOnAdaptive. + denoiser_on_ = j; + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestRealTime, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestRealTime, BasicBufferModel) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // 2 pass cbr datarate control has a bug hidden by the small # of + // frames selected in this encode. The problem is that even if the buffer is + // negative we produce a keyframe on a cutscene, ignoring datarate + // constraints + // TODO(jimbankoski): Fix when issue + // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed. + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + + // There is an issue for low bitrates in real-time mode, where the + // effective_datarate slightly overshoots the target bitrate. + // This is same the issue as noted above (#495). + // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), + // when the issue is resolved. 
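+  // Note on the bounds used below: ASSERT_GE(target, effective * 0.95) caps
+  // the overshoot at roughly 5% (e.g. at the 100 kbps end of the sweep the
+  // measured effective_datarate_ may not exceed ~105 kbps), while
+  // ASSERT_LE(target, file_datarate_ * 1.4) tolerates an undershoot down to
+  // about target / 1.4, i.e. ~71 kbps for the 100 kbps case.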
+  for (int i = 100; i <= 700; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_max_quantizer = 36;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  const int frame_count = 40;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, frame_count);
+
+  // Check that the first dropped frame gets earlier and earlier
+  // as the drop frame threshold is increased.
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = frame_count;
+  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+  }
+}
+
+TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
   denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_dropframe_thresh = 30;
@@ -281,10 +419,93 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
   ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
       << " The datarate for the file exceeds the target!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = 1;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 15) / 16;
+  roi_.cols = (cfg_.g_w + 15) / 16;
+
+  roi_.delta_q[0] = 0;
+  roi_.delta_q[1] = -20;
+  roi_.delta_q[2] = 0;
+  roi_.delta_q[3] = 0;
+
+  roi_.delta_lf[0] = 0;
+  roi_.delta_lf[1] = -20;
+  roi_.delta_lf[2] = 0;
+  roi_.delta_lf[3] = 0;
+
+  roi_.static_threshold[0] = 0;
+  roi_.static_threshold[1] = 1000;
+  roi_.static_threshold[2] = 0;
+  roi_.static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+ roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +TEST_P(DatarateTestRealTime, GFBoost) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // Apply a gf boost. + gf_boost_ = 50; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } -#endif class DatarateTestVP9Large : public ::libvpx_test::EncoderTest, @@ -317,6 +538,7 @@ class DatarateTestVP9Large } denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; } // @@ -331,8 +553,8 @@ class DatarateTestVP9Large // 2 6 // 0 4 .... // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. - // For this 3 layer example, the 2nd enhancement layer (layer 2) does not - // update any reference frames. + // For this 3 layer example, the 2nd enhancement layer (layer 2) updates + // the altref frame. int SetFrameFlags(int frame_num, int num_temp_layers) { int frame_flags = 0; if (num_temp_layers == 2) { @@ -354,9 +576,8 @@ class DatarateTestVP9Large // Layer 1: predict from L, G, ARF; update G. frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; } else if ((frame_num - 1) % 2 == 0) { - // Layer 2: predict from L, G, ARF; update none. - frame_flags = - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: predict from L, G, ARF; update ARF. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; } } return frame_flags; @@ -396,6 +617,9 @@ class DatarateTestVP9Large } encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { @@ -434,8 +658,8 @@ class DatarateTestVP9Large duration * timebase_ * cfg_.rc_target_bitrate * 1000); // Buffer should not go negative. - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; const size_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -476,6 +700,7 @@ class DatarateTestVP9Large int denoiser_on_; int denoiser_offon_test_; int denoiser_offon_period_; + int frame_parallel_decoding_mode_; }; // Check basic rate targeting for VBR mode with 0 lag. 
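As an aside, here is a minimal sketch (not part of this patch) of how an application might hand the same kind of ROI map to a VP8 encoder through the public control interface. It assumes an already-initialized `vpx_codec_ctx_t` and a helper name (`set_center_roi`) invented for illustration; the `vpx_roi_map_t` fields mirror the ones exercised by the RegionOfInterest test above.

```c
#include <stdlib.h>
#include <string.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Boost quality in a centered rectangle, similar to the test above. */
static vpx_codec_err_t set_center_roi(vpx_codec_ctx_t *encoder,
                                      unsigned int width,
                                      unsigned int height) {
  vpx_roi_map_t roi;
  memset(&roi, 0, sizeof(roi));
  roi.rows = (height + 15) / 16; /* one cell per 16x16 macroblock */
  roi.cols = (width + 15) / 16;
  roi.delta_q[1] = -20;          /* segment 1: lower QP (higher quality)  */
  roi.delta_lf[1] = -20;         /* segment 1: weaker loop filtering      */
  roi.static_threshold[1] = 1000;
  roi.roi_map = (unsigned char *)calloc(roi.rows * roi.cols, 1);
  if (!roi.roi_map) return VPX_CODEC_MEM_ERROR;
  /* Mark roughly the center quarter of the frame as segment 1. */
  for (unsigned int i = roi.rows / 4; i < (roi.rows * 3) / 4; ++i)
    for (unsigned int j = roi.cols / 4; j < (roi.cols * 3) / 4; ++j)
      roi.roi_map[i * roi.cols + j] = 1;
  const vpx_codec_err_t res =
      vpx_codec_control(encoder, VP8E_SET_ROI_MAP, &roi);
  free(roi.roi_map);
  return res;
}
```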
@@ -494,7 +719,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -521,7 +746,37 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) + << " The datarate for the file is greater than target by too much!"; + } +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + for (int i = 400; i <= 800; i += 400) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -550,6 +805,33 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) { } } +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off( and error_resilience off). +TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int i = 150; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; + } +} + // Check basic rate targeting for CBR mode, with 2 threads and dropped frames. 
TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) { cfg_.rc_buf_initial_sz = 500; @@ -594,7 +876,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting444) { ResetModel(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(static_cast(cfg_.rc_target_bitrate), - effective_datarate_[0] * 0.85) + effective_datarate_[0] * 0.80) << " The datarate for the file exceeds the target by too much!"; ASSERT_LE(static_cast(cfg_.rc_target_bitrate), effective_datarate_[0] * 1.15) @@ -627,26 +909,29 @@ TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) { 30, 1, 0, 140); const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = 140; - int last_num_drops = 0; - for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - ASSERT_GE(num_drops_, last_num_drops * 0.85) - << " The number of dropped frames for drop_thresh " << i - << " < number of dropped frames for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - last_num_drops = num_drops_; + for (int j = 50; j <= 150; j += 100) { + cfg_.rc_target_bitrate = j; + vpx_codec_pts_t last_drop = 140; + int last_num_drops = 0; + for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + << " The datarate for the file is greater than target by too much!"; + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + ASSERT_GE(num_drops_, last_num_drops * 0.85) + << " The number of dropped frames for drop_thresh " << i + << " < number of dropped frames for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + last_num_drops = num_drops_; + } } } @@ -789,8 +1074,13 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { } #if CONFIG_VP9_TEMPORAL_DENOISING +class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { + public: + virtual ~DatarateTestVP9LargeDenoiser() {} +}; + // Check basic datarate targeting, for a single bitrate, when denoiser is on. -TEST_P(DatarateTestVP9Large, DenoiserLevels) { +TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -817,9 +1107,67 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) { << " The datarate for the file is greater than target by too much!"; } +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for clip with high noise level. Use 2 threads. 
+TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 2; + + ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for 1280x720 clip with 4 threads. +TEST_P(DatarateTestVP9LargeDenoiser, 4threads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 4; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) + << " The datarate for the file is greater than target by too much!"; +} + // Check basic datarate targeting, for a single bitrate, when denoiser is off // and on. 
-TEST_P(DatarateTestVP9Large, DenoiserOffOn) { +TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -868,13 +1216,17 @@ class DatarateOnePassCbrSvc } virtual void ResetModel() { last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; duration_ = 0.0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; + denoiser_on_ = 0; + tune_content_ = 0; + base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); } virtual void BeginPassHook(unsigned int /*pass*/) {} virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -885,45 +1237,114 @@ class DatarateOnePassCbrSvc svc_params_.max_quantizers[i] = 63; svc_params_.min_quantizers[i] = 0; } - svc_params_.speed_per_layer[0] = 5; + svc_params_.speed_per_layer[0] = base_speed_setting_; for (i = 1; i < VPX_SS_MAX_LAYERS; ++i) { - svc_params_.speed_per_layer[i] = 7; + svc_params_.speed_per_layer[i] = speed_setting_; } + + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); encoder->Control(VP8E_SET_CPUUSED, speed_setting_); encoder->Control(VP9E_SET_TILE_COLUMNS, 0); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); } const vpx_rational_t tb = video->timebase(); timebase_ = static_cast(tb.num) / tb.den; duration_ = 0; } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + spatial_layer_id_ = layer_id.spatial_layer_id; + temporal_layer_id_ = layer_id.temporal_layer_id; + // Update buffer with per-layer target frame bandwidth, this is done + // for every frame passed to the encoder (encoded or dropped). + // For temporal layers, update the cumulative buffer level. + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast(layer_target_avg_bandwidth_[layer]); + } + } + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. 
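+        // For example, a marker byte of 0xca has its top three bits equal to
+        // 0xc0 and encodes (0xca & 0x7) + 1 = 3 frames with
+        // ((0xca >> 3) & 0x3) + 1 = 2 size bytes each, so index_sz is
+        // 2 + 2 * 3 = 8: one marker byte at each end plus 2 bytes per frame.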
+ if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - if (last_pts_ == 0) duration = 1; - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); + uint32_t sizes[8] = { 0 }; + int count = 0; + last_pts_ = pkt->data.frame.pts; const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + parse_superframe_index(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, sizes, &count); + ASSERT_EQ(count, number_spatial_layers_); + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. + if (!key_frame && tl > 0) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } } - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - bits_in_buffer_model_ -= frame_size_in_bits; - bits_total_ += frame_size_in_bits; - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - last_pts_ = pkt->data.frame.pts; - bits_in_last_frame_ = frame_size_in_bits; - ++frame_number_; } + virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - duration_ = (last_pts_ + 1) * timebase_; - file_datarate_ = file_size_in_kb / duration_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + duration_ = (last_pts_ + 1) * timebase_; + file_datarate_[layer] = file_size_in_kb / duration_; + } } } @@ -936,26 +1357,35 @@ class DatarateOnePassCbrSvc unsigned int GetMismatchFrames() { return mismatch_nframes_; } vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; + int64_t bits_total_[VPX_MAX_LAYERS]; double duration_; - double file_datarate_; + double file_datarate_[VPX_MAX_LAYERS]; size_t bits_in_last_frame_; vpx_svc_extra_cfg_t svc_params_; int speed_setting_; double mismatch_psnr_; int mismatch_nframes_; + int denoiser_on_; + int tune_content_; + int base_speed_setting_; + int spatial_layer_id_; + int temporal_layer_id_; + int number_spatial_layers_; + int number_temporal_layers_; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; }; static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, const vpx_svc_extra_cfg_t *svc_params, int 
spatial_layers, int temporal_layers, - int temporal_layering_mode) { + int temporal_layering_mode, + int *layer_target_avg_bandwidth, + int64_t *bits_in_buffer_model) { int sl, spatial_layer_target; float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; for (sl = 0; sl < spatial_layers; ++sl) { if (svc_params->scaling_factor_den[sl] > 0) { alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / @@ -975,13 +1405,47 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, } else if (temporal_layering_mode == 2) { enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (temporal_layering_mode <= 1) { + enc_cfg->layer_target_bitrate[index] = spatial_layer_target; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth[layer] = static_cast( + enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model[layer] = + enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz; } } } -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and -// 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) { +static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg, + int number_spatial_layers, + int number_temporal_layers, + double *file_datarate, + double thresh_overshoot, + double thresh_undershoot) { + for (int sl = 0; sl < number_spatial_layers; ++sl) + for (int tl = 0; tl < number_temporal_layers; ++tl) { + const int layer = sl * number_temporal_layers + tl; + ASSERT_GE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too much!"; + } +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 +// temporal layer, with screen content mode on and same speed setting for all +// layers. 
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -990,40 +1454,142 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) { cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; cfg_.g_error_resilient = 1; cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; + cfg_.temporal_layering_mode = 0; svc_params_.scaling_factor_num[0] = 144; svc_params_.scaling_factor_den[0] = 288; svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 500; + ResetModel(); + tune_content_ = 1; + base_speed_setting_ = speed_setting_; + assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run CIF clip with 1 thread. +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + svc_params_.scaling_factor_num[0] = 144; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 288; + svc_params_.scaling_factor_den[1] = 288; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. 
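+  // Illustrative numbers for the buffer model that assign_layer_bitrates()
+  // sets up with 3 temporal layers on this 30 fps clip: TL0 is treated as a
+  // 7.5 fps stream and TL1 as 15 fps, so a 100 kbps TL0 target translates to
+  // 100 * 1000 / 7.5 = 13333 bits of budget per TL0 frame, and that layer's
+  // buffer starts at 100 * rc_buf_initial_sz = 50000 bits.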
   for (int i = 200; i <= 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                            number_temporal_layers_, file_datarate_, 0.78,
+                            1.15);
+#if CONFIG_VP9_DECODER
+    // Number of temporal layers > 1, so half of the frames in this SVC pattern
+    // will be non-reference frame and hence encoder will avoid loopfilter.
+    // Since frame dropper is off, we can expect 200 (half of the sequence)
+    // mismatched frames.
+    EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
+#endif
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with denoising.
+// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 2;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  // TODO(marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  // For SVC, noise_sen = 1 means denoising only the top spatial layer
+  // noise_sen = 2 means denoising the two top spatial layers.
+  for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
+    for (int i = 600; i <= 1000; i += 200) {
+      cfg_.rc_target_bitrate = i;
+      ResetModel();
+      denoiser_on_ = noise_sen;
+      assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+                            cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                            layer_target_avg_bandwidth_, bits_in_buffer_model_);
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                              number_temporal_layers_, file_datarate_, 0.78,
+                              1.15);
+#if CONFIG_VP9_DECODER
+      // Number of temporal layers > 1, so half of the frames in this SVC
+      // pattern
+      // will be non-reference frame and hence encoder will avoid loopfilter.
+      // Since frame dropper is off, we can expect 200 (half of the sequence)
+      // mismatched frames.
+      EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
+#endif
+    }
   }
 }

 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1044,28 +1610,29 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) { svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); cfg_.rc_target_bitrate = 400; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). for (int j = 64; j <= 67; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and // 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1085,25 +1652,31 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) { svc_params_.scaling_factor_den[0] = 288; svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; + cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. 
+ // Since frame dropper is off, we can expect 30 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast(30), GetMismatchFrames()); +#endif } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and // 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1125,25 +1698,32 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) { svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; + cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 200 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast(200), GetMismatchFrames()); +#endif } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1166,28 +1746,29 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) { svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); cfg_.rc_target_bitrate = 800; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
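+  // e.g. with the 4-frame pattern (temporal layers 0-2-1-2), the first forced
+  // key frame lands on frame 32 (layer 0), 33 (layer 2), 34 (layer 1) or
+  // 35 (layer 2) for the respective kf_max_dist values swept below.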
for (int j = 32; j <= 35; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and // 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1209,27 +1790,86 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) { svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; + cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 30 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast(30), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial +// downscale 5x5. 
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 3; + cfg_.temporal_layering_mode = 0; + svc_params_.scaling_factor_num[0] = 256; + svc_params_.scaling_factor_den[0] = 1280; + svc_params_.scaling_factor_num[1] = 1280; + svc_params_.scaling_factor_den[1] = 1280; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 999999; + cfg_.kf_min_dist = 0; + cfg_.ss_target_bitrate[0] = 300; + cfg_.ss_target_bitrate[1] = 1400; + cfg_.layer_target_bitrate[0] = 300; + cfg_.layer_target_bitrate[1] = 1400; + cfg_.rc_target_bitrate = 1700; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); EXPECT_EQ(static_cast(0), GetMismatchFrames()); } -VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES); +VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES, + ::testing::Values(0)); +VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(-6, -12)); VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), ::testing::Range(2, 9)); +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); +#endif VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc, ::testing::Values(::libvpx_test::kRealTime), ::testing::Range(5, 9)); diff --git a/libs/libvpx/test/dct16x16_test.cc b/libs/libvpx/test/dct16x16_test.cc index f9745ed81f..ce0bd37b3d 100644 --- a/libs/libvpx/test/dct16x16_test.cc +++ b/libs/libvpx/test/dct16x16_test.cc @@ -255,11 +255,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride, @@ -273,36 +273,36 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride, } void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, 
out, stride, tx_type, 12); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -353,7 +353,7 @@ class Trans16x16TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -475,10 +475,10 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_, + inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_, tx_type_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } if (bit_depth_ == VPX_BITS_8) { @@ -530,8 +530,7 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16)); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -543,8 +542,8 @@ class Trans16x16TestBase { const uint32_t diff = dst[j] - src[j]; #endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; - EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error - << " at index " << j; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error << " at index " << j; } } } @@ -585,9 +584,9 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); } else { #if CONFIG_VP9_HIGHBITDEPTH - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -745,66 +744,6 @@ TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -class PartialTrans16x16Test : public ::testing::TestWithParam< - std::tr1::tuple > { - public: - virtual ~PartialTrans16x16Test() {} - virtual void 
SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FdctFunc fwd_txfm_; -}; - -TEST_P(PartialTrans16x16Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]); -} - -TEST_P(PartialTrans16x16Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ(sum >> 1, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -837,11 +776,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans16x16HT, @@ -850,17 +784,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon, - 0, VPX_BITS_8))); -#endif + ::testing::Values(make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -877,9 +808,6 @@ INSTANTIATE_TEST_CASE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -914,9 +842,6 @@ INSTANTIATE_TEST_CASE_P( &idct16x16_10_add_12_sse2, 3167, 
VPX_BITS_12), make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -932,8 +857,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/dct32x32_test.cc b/libs/libvpx/test/dct32x32_test.cc index a168e690ee..a95ff97328 100644 --- a/libs/libvpx/test/dct32x32_test.cc +++ b/libs/libvpx/test/dct32x32_test.cc @@ -71,11 +71,11 @@ typedef std::tr1::tuple #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -137,7 +137,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32)); + inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32)); #endif } @@ -275,7 +275,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32)); + ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32)); #endif } for (int j = 0; j < kNumCoeffs; ++j) { @@ -292,67 +292,6 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } -class PartialTrans32x32Test - : public ::testing::TestWithParam< - std::tr1::tuple > { - public: - virtual ~PartialTrans32x32Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FwdTxfmFunc fwd_txfm_; -}; - -TEST_P(PartialTrans32x32Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]); -} - -TEST_P(PartialTrans32x32Test, Random) { -#if 
CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ(sum >> 3, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -366,11 +305,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans32x32Test, @@ -378,19 +312,16 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct32x32_rd_c, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_neon, &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -399,9 +330,6 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -418,9 +346,6 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -439,8 +364,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_msa, &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + VSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, + 0, VPX_BITS_8), + 
make_tuple(&vpx_fdct32x32_rd_c, + &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/dct_partial_test.cc b/libs/libvpx/test/dct_partial_test.cc new file mode 100644 index 0000000000..4d145f5891 --- /dev/null +++ b/libs/libvpx/test/dct_partial_test.cc @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); + +typedef tuple + PartialFdctParam; + +tran_low_t partial_fdct_ref(const Buffer &in, int size) { + int64_t sum = 0; + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } + } + + switch (size) { + case 4: sum *= 2; break; + case 8: /*sum = sum;*/ break; + case 16: sum >>= 1; break; + case 32: sum >>= 3; break; + } + + return static_cast(sum); +} + +class PartialFdctTest : public ::testing::TestWithParam { + public: + PartialFdctTest() { + fwd_txfm_ = GET_PARAM(0); + size_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int16_t maxvalue = + clip_pixel_highbd(std::numeric_limits::max(), bit_depth_); + const int16_t minvalue = -maxvalue; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 
0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); + } + } + + PartialFdctFunc fwd_txfm_; + vpx_bit_depth_t bit_depth_; + int size_; +}; + +TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); +#else // !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA +} // namespace diff --git a/libs/libvpx/test/dct_test.cc b/libs/libvpx/test/dct_test.cc new file mode 100644 index 0000000000..addbdfb463 --- /dev/null +++ b/libs/libvpx/test/dct_test.cc @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*FhtFuncRef)(const Buffer &in, Buffer *out, + int size, int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); + +/* forward transform, inverse transform, size, transform type, bit depth */ +typedef tuple DctParam; +typedef tuple HtParam; + +void fdct_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vpx_fdct4x4_c(i, o, i_stride); + } else if (size == 8) { + vpx_fdct8x8_c(i, o, i_stride); + } else if (size == 16) { + vpx_fdct16x16_c(i, o, i_stride); + } else if (size == 32) { + vpx_fdct32x32_c(i, o, i_stride); + } +} + +void fht_ref(const Buffer &in, Buffer *out, int size, + int tx_type) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vp9_fht4x4_c(i, o, i_stride, tx_type); + } else if (size == 8) { + vp9_fht8x8_c(i, o, i_stride, tx_type); + } else if (size == 16) { + vp9_fht16x16_c(i, o, i_stride, tx_type); + } +} + +void fwht_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + ASSERT_EQ(size, 4); + vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define idctNxN(n, coeffs, bitdepth) \ + void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride) { \ + vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, bitdepth); \ + } + +idctNxN(4, 16, 10); +idctNxN(4, 16, 12); +idctNxN(8, 64, 10); +idctNxN(8, 64, 12); +idctNxN(16, 256, 10); +idctNxN(16, 256, 12); +idctNxN(32, 1024, 10); +idctNxN(32, 1024, 12); + +#define ihtNxN(n, coeffs, bitdepth) \ + void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride, int tx_type) { \ + vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, tx_type, bitdepth); \ + } + +ihtNxN(4, 16, 10); +ihtNxN(4, 16, 12); +ihtNxN(8, 64, 10); +ihtNxN(8, 64, 12); +ihtNxN(16, 256, 10); +// ihtNxN(16, 256, 12); + +void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} 
+#endif // CONFIG_VP9_HIGHBITDEPTH + +class TransTestBase { + public: + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + virtual void RunFwdTxfm(const Buffer &in, + Buffer *out) = 0; + + virtual void RunInvTxfm(const Buffer &in, uint8_t *out) = 0; + + void RunAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Buffer test_input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(test_input_block.Init()); + Buffer test_temp_block = + Buffer(size_, size_, 0, 16); + ASSERT_TRUE(test_temp_block.Init()); + Buffer dst = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer src = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(src.Init()); +#if CONFIG_VP9_HIGHBITDEPTH + Buffer dst16 = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer src16 = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(src16.Init()); +#endif // CONFIG_VP9_HIGHBITDEPTH + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + if (bit_depth_ == 8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + // Initialize a test block with input range [-255, 255]. + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + } + + EXPECT_GE(static_cast(limit), max_error) + << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; + + EXPECT_GE(count_test_block * limit, total_error) + << "Error: 4x4 FHT/IHT has average round trip error > " << limit + << " per block"; + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 
0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_); + + fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.PrintDifference(output_ref_block); + return; + } + } + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_extreme_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_extreme_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with -max_pixel_value_ or max_pixel_value_. + if (i == 0) { + input_extreme_block.Set(max_pixel_value_); + } else if (i == 1) { + input_extreme_block.Set(-max_pixel_value_); + } else { + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + input_extreme_block + .TopLeftPixel()[h * input_extreme_block.stride() + w] = + rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_; + } + } + } + + fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + EXPECT_GE( + 4 * DCT_MAX_VALUE << (bit_depth_ - 8), + abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) + << "Error: 4x4 FDCT has coefficient larger than " + "4*DCT_MAX_VALUE" + << " at " << w << "," << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.DumpBuffer(); + return; + } + } + } + } + } + + void RunInvAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + Buffer in = Buffer(size_, size_, 4); + ASSERT_TRUE(in.Init()); + Buffer coeff = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer dst = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer src = Buffer(size_, size_, 0); + ASSERT_TRUE(src.Init()); + Buffer dst16 = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer src16 = Buffer(size_, size_, 0); + ASSERT_TRUE(src16.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. 
+ if (bit_depth_ == VPX_BITS_8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + fwd_txfm_ref(in, &coeff, size_, tx_type_); + + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_GE(static_cast(limit), error) + << "Error: " << size_ << "x" << size_ << " IDCT has error " + << error << " at " << w << "," << h; + } + } + } + } + + FhtFuncRef fwd_txfm_ref; + vpx_bit_depth_t bit_depth_; + int tx_type_; + int max_pixel_value_; + int size_; +}; + +class TransDCT : public TransTestBase, + public ::testing::TestWithParam { + public: + TransDCT() { + fwd_txfm_ref = fdct_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransDCT, MemCheck) { RunMemCheck(); } + +TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_c, 
&vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +#if !CONFIG_EMULATE_HARDWARE +#if CONFIG_VP9_HIGHBITDEPTH +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12), +*/ +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0, + VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, + &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_sse2, + &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 + +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#if !ARCH_X86_64 +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8))); +#else +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, + 8, 0, VPX_BITS_8))); +#endif // !ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE +// TODO(johannkoenig): high bit depth fdct32x32. 
+INSTANTIATE_TEST_CASE_P( + AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, + &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8))); + +#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + NEON, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4, + 0, VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON + +#if HAVE_MSA +#if !CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + MSA, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0, + VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct4x4_c, + &vpx_idct4x4_16_add_vsx, 4, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +class TransHT : public TransTestBase, public ::testing::TestWithParam { + public: + TransHT() { + fwd_txfm_ref = fht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12), + */ +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, 
&iht8x8_10, 8, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3, + VPX_BITS_8), + + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3, + VPX_BITS_8))); +#endif // HAVE_SSE2 + +class TransWHT : public 
TransTestBase, + public ::testing::TestWithParam { + public: + TransWHT() { + fwd_txfm_ref = fwht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } + +TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransWHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransWHT, + ::testing::Values( + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P(C, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_c, + &vpx_iwht4x4_16_add_c, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_sse2, + &vpx_iwht4x4_16_add_sse2, + 4, 0, VPX_BITS_8))); +#endif // HAVE_SSE2 +} // namespace diff --git a/libs/libvpx/test/decode_api_test.cc b/libs/libvpx/test/decode_api_test.cc index 593637780e..4167cf3e0f 100644 --- a/libs/libvpx/test/decode_api_test.cc +++ b/libs/libvpx/test/decode_api_test.cc @@ -172,4 +172,21 @@ TEST(DecodeAPI, Vp9PeekSI) { } #endif // CONFIG_VP9_DECODER +TEST(DecodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_DECODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_dx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_DECODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_dx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + } // namespace diff --git a/libs/libvpx/test/decode_svc_test.cc b/libs/libvpx/test/decode_svc_test.cc new file mode 100644 index 0000000000..69f62f13bd --- /dev/null +++ b/libs/libvpx/test/decode_svc_test.cc @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/test_vectors.h" +#include "test/util.h" + +namespace { + +const unsigned int kNumFrames = 19; + +class DecodeSvcTest : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} + virtual ~DecodeSvcTest() {} + + virtual void PreDecodeFrameHook( + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) { + if (video.frame_number() == 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); + } + + virtual void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) { + ASSERT_EQ(img.d_w, width_); + ASSERT_EQ(img.d_h, height_); + total_frames_ = frame_number; + } + + int spatial_layer_; + unsigned int width_; + unsigned int height_; + unsigned int total_frames_; +}; + +// SVC test vector is 1280x720, with 3 spatial layers, and 20 frames. + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 0. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in 1/4x1/4 resolution (320x180). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 0; + width_ = 320; + height_ = 180; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 1. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in 1/2x1/2 resolution (640x360). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 1; + width_ = 640; + height_ = 360; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 2. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in the full resolution (1280x720). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 2; + width_ = 1280; + height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 10. Verify the resolution of each decoded frame and the total +// number of frames decoded. This is beyond the number of spatial layers, so +// the decoding should result in the full resolution (1280x720). 
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 10; + width_ = 1280; + height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +VP9_INSTANTIATE_TEST_CASE( + DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc, + libvpx_test::kVP9TestVectorsSvc + + libvpx_test::kNumVP9TestVectorsSvc)); +} // namespace diff --git a/libs/libvpx/test/decode_test_driver.cc b/libs/libvpx/test/decode_test_driver.cc index b738e0db1c..48680eb8e9 100644 --- a/libs/libvpx/test/decode_test_driver.cc +++ b/libs/libvpx/test/decode_test_driver.cc @@ -53,13 +53,13 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder, * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first * frame, which must be a keyframe. */ if (video->frame_number() == 0) - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } else { /* The Vp9 implementation of PeekStream returns an error only if the * data passed to it isn't a valid Vp9 chunk. */ - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } } diff --git a/libs/libvpx/test/encode_api_test.cc b/libs/libvpx/test/encode_api_test.cc index 419e38506c..87e29b61db 100644 --- a/libs/libvpx/test/encode_api_test.cc +++ b/libs/libvpx/test/encode_api_test.cc @@ -62,4 +62,134 @@ TEST(EncodeAPI, InvalidParams) { } } +TEST(EncodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_ENCODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_cx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_ENCODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_cx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + +#if CONFIG_VP8_ENCODER +TEST(EncodeAPI, ImageSizeSetting) { + const int width = 711; + const int height = 360; + const int bps = 12; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + uint8_t *img_buf = reinterpret_cast( + calloc(width * height * bps / 8, sizeof(*img_buf))); + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0); + + cfg.g_w = width; + cfg.g_h = height; + + vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf); + + vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0)); + + free(img_buf); + + vpx_codec_destroy(&enc); +} +#endif + +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. 
+TEST(EncodeAPI, MultiResEncode) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif + }; + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (int c = 0; c < NELEMENTS(kCodecs); ++c) { + const vpx_codec_iface_t *const iface = kCodecs[c]; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. + const char kVP9Name[] = "WebM Project VP9"; + const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), + sizeof(kVP9Name) - 1) == 0; + EXPECT_EQ(is_vp9 ? 
VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + } // namespace diff --git a/libs/libvpx/test/encode_test_driver.cc b/libs/libvpx/test/encode_test_driver.cc index 632c98f05a..b2cbc3f05b 100644 --- a/libs/libvpx/test/encode_test_driver.cc +++ b/libs/libvpx/test/encode_test_driver.cc @@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) { PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); + PostEncodeFrameHook(encoder.get()); + CxDataIterator iter = encoder->GetCxData(); bool has_cxdata = false; @@ -226,6 +228,8 @@ void EncoderTest::RunLoop(VideoSource *video) { case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break; + default: break; } } diff --git a/libs/libvpx/test/encode_test_driver.h b/libs/libvpx/test/encode_test_driver.h index 09b1a78344..89a3b1767e 100644 --- a/libs/libvpx/test/encode_test_driver.h +++ b/libs/libvpx/test/encode_test_driver.h @@ -139,6 +139,13 @@ class Encoder { } #endif +#if CONFIG_VP8_ENCODER + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif + void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -212,12 +219,17 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} + // Hook to be called on every compressed data packet. virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} // Hook to be called on every PSNR packet. virtual void PSNRPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + // Hook to be called on every first pass stats packet. + virtual void StatsPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + // Hook to determine whether the encode loop should continue. virtual bool Continue() const { return !(::testing::Test::HasFatalFailure() || abort_); diff --git a/libs/libvpx/test/error_resilience_test.cc b/libs/libvpx/test/error_resilience_test.cc index 4c7ede8cae..030b67c572 100644 --- a/libs/libvpx/test/error_resilience_test.cc +++ b/libs/libvpx/test/error_resilience_test.cc @@ -90,8 +90,7 @@ class ErrorResilienceTestLarge return frame_flags; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - ::libvpx_test::Encoder * /*encoder*/) { + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) { frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. diff --git a/libs/libvpx/test/examples.sh b/libs/libvpx/test/examples.sh index 39f7e392db..629f04239c 100755 --- a/libs/libvpx/test/examples.sh +++ b/libs/libvpx/test/examples.sh @@ -15,7 +15,7 @@ example_tests=$(ls $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="examples tools_common" +exclude_list="examples stress tools_common" # Filter out the scripts in $exclude_list. 
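// Illustrative sketch (not part of the patch): the Control(int ctrl_id,
// vpx_roi_map_t *) overload added to the encoder test driver above forwards
// to vpx_codec_control_(), i.e. the same call an application would make,
// presumably with VP8E_SET_ROI_MAP, to attach a VP8 region-of-interest map.
// A minimal, hypothetical setup (SetCenterRoi and the segment choices are
// made up for illustration):

#include <cstring>
#include <vector>
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static vpx_codec_err_t SetCenterRoi(vpx_codec_ctx_t *enc, int frame_width,
                                    int frame_height) {
  const int mb_cols = (frame_width + 15) / 16;   // VP8 ROI maps are indexed
  const int mb_rows = (frame_height + 15) / 16;  // per 16x16 macroblock
  std::vector<unsigned char> map(mb_rows * mb_cols, 0);
  // Put the middle third of the frame into segment 1.
  for (int r = mb_rows / 3; r < 2 * mb_rows / 3; ++r) {
    for (int c = mb_cols / 3; c < 2 * mb_cols / 3; ++c) {
      map[r * mb_cols + c] = 1;
    }
  }

  vpx_roi_map_t roi;
  memset(&roi, 0, sizeof(roi));
  roi.rows = static_cast<unsigned int>(mb_rows);
  roi.cols = static_cast<unsigned int>(mb_cols);
  roi.roi_map = &map[0];
  roi.delta_q[1] = -10;  // give segment 1 a lower quantizer (better quality)
  return vpx_codec_control(enc, VP8E_SET_ROI_MAP, &roi);
}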
for word in ${exclude_list}; do diff --git a/libs/libvpx/test/external_frame_buffer_test.cc b/libs/libvpx/test/external_frame_buffer_test.cc index f9686695a7..dbf2971198 100644 --- a/libs/libvpx/test/external_frame_buffer_test.cc +++ b/libs/libvpx/test/external_frame_buffer_test.cc @@ -34,7 +34,8 @@ struct ExternalFrameBuffer { // Class to manipulate a list of external frame buffers. class ExternalFrameBufferList { public: - ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {} + ExternalFrameBufferList() + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {} virtual ~ExternalFrameBufferList() { for (int i = 0; i < num_buffers_; ++i) { @@ -71,6 +72,8 @@ class ExternalFrameBufferList { } SetFrameBuffer(idx, fb); + + num_used_buffers_++; return 0; } @@ -106,6 +109,7 @@ class ExternalFrameBufferList { } EXPECT_EQ(1, ext_fb->in_use); ext_fb->in_use = 0; + num_used_buffers_--; return 0; } @@ -121,6 +125,8 @@ class ExternalFrameBufferList { } } + int num_used_buffers() const { return num_used_buffers_; } + private: // Returns the index of the first free frame buffer. Returns |num_buffers_| // if there are no free frame buffers. @@ -145,6 +151,7 @@ class ExternalFrameBufferList { } int num_buffers_; + int num_used_buffers_; ExternalFrameBuffer *ext_fb_list_; }; @@ -220,8 +227,8 @@ class ExternalFrameBufferMD5Test void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -273,6 +280,7 @@ class ExternalFrameBufferMD5Test #if CONFIG_WEBM_IO const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; // Class for testing passing in external frame buffers to libvpx. class ExternalFrameBufferTest : public ::testing::Test { @@ -292,7 +300,9 @@ class ExternalFrameBufferTest : public ::testing::Test { virtual void TearDown() { delete decoder_; + decoder_ = NULL; delete video_; + video_ = NULL; } // Passes the external frame buffer information to libvpx. @@ -325,7 +335,7 @@ class ExternalFrameBufferTest : public ::testing::Test { return VPX_CODEC_OK; } - private: + protected: void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); const vpx_image_t *img = NULL; @@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test { int num_buffers_; ExternalFrameBufferList fb_list_; }; + +class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { + protected: + virtual void SetUp() { + video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); + ASSERT_TRUE(video_ != NULL); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_TRUE(decoder_ != NULL); + } + + virtual void CheckFrameBufferRelease() { + TearDown(); + ASSERT_EQ(0, fb_list_.num_used_buffers()); + } +}; #endif // CONFIG_WEBM_IO // This test runs through the set of test vectors, and decodes them. @@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. 
Decoding a very short clip will return + // VPX_CODEC_OK even with only 2 buffers. ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); } @@ -467,6 +498,15 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); } + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} #endif // CONFIG_WEBM_IO VP9_INSTANTIATE_TEST_CASE( diff --git a/libs/libvpx/test/fdct4x4_test.cc b/libs/libvpx/test/fdct4x4_test.cc deleted file mode 100644 index 1270dae94a..0000000000 --- a/libs/libvpx/test/fdct4x4_test.cc +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "test/util.h" -#include "vp9/common/vp9_entropy.h" -#include "vpx/vpx_codec.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -using libvpx_test::ACMRandom; - -namespace { -const int kNumCoeffs = 16; -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); - -typedef std::tr1::tuple Dct4x4Param; -typedef std::tr1::tuple Ht4x4Param; - -void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vpx_fdct4x4_c(in, out, stride); -} - -void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { - vp9_fht4x4_c(in, out, stride, tx_type); -} - -void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vp9_fwht4x4_c(in, out, stride); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 10); -} - -void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 12); -} - -void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10); -} - -void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12); -} - -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10); -} - -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12); -} - -#if HAVE_SSE2 -void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10); -} - -void 
idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12); -} -#endif // HAVE_SSE2 -#endif // CONFIG_VP9_HIGHBITDEPTH - -class Trans4x4TestBase { - public: - virtual ~Trans4x4TestBase() {} - - protected: - virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0; - - virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0; - - void RunAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - uint32_t max_error = 0; - int64_t total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - test_input_block[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - test_input_block[j] = src16[j] - dst16[j]; -#endif - } - } - - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(test_input_block, test_temp_block, pitch_)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - ASSERT_EQ(VPX_BITS_8, bit_depth_); - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - if (max_error < error) max_error = error; - total_error += error; - } - } - - EXPECT_GE(static_cast(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; - - EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " per block"; - } - - void RunCoeffCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); - } - - fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); - - // The minimum quant value is 4. 
- for (int j = 0; j < kNumCoeffs; ++j) - EXPECT_EQ(output_block[j], output_ref_block[j]); - } - } - - void RunMemCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; - } - if (i == 0) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; - } else if (i == 1) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; - } - - fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(input_extreme_block, output_block, pitch_)); - - // The minimum quant value is 4. - for (int j = 0; j < kNumCoeffs; ++j) { - EXPECT_EQ(output_block[j], output_ref_block[j]); - EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) - << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - } - } - } - - void RunInvAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - in[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - in[j] = src16[j] - dst16[j]; -#endif - } - } - - fwd_txfm_ref(in, coeff, pitch_, tx_type_); - - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? 
dst[j] - src[j] : dst16[j] - src16[j]; -#else - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - EXPECT_GE(static_cast(limit), error) - << "Error: 4x4 IDCT has error " << error << " at index " << j; - } - } - } - - int pitch_; - int tx_type_; - FhtFunc fwd_txfm_ref; - vpx_bit_depth_t bit_depth_; - int mask_; -}; - -class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4DCT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fdct4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4HT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride, tx_type_); - } - - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride, tx_type_); - } - - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; -}; - -TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4WHT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fwht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); } - -TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } -using std::tr1::make_tuple; - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_c, 
0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_neon, - 0, VPX_BITS_8))); -#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - NEON, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8))); -#endif - -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_sse2, - &vpx_idct4x4_16_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE 
-INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); - -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_msa, - &vpx_idct4x4_16_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - MSA, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -} // namespace diff --git a/libs/libvpx/test/fdct8x8_test.cc b/libs/libvpx/test/fdct8x8_test.cc index 3ac73c1250..5021dda9b3 100644 --- a/libs/libvpx/test/fdct8x8_test.cc +++ b/libs/libvpx/test/fdct8x8_test.cc @@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { #if CONFIG_VP9_HIGHBITDEPTH void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 -void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_c(in, out, stride, 10); +void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } -void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_c(in, out, stride, 12); +void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } -void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10); +void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } -void idct8x8_10_add_12_sse2(const tran_low_t *in, 
uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 12); +void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -257,7 +257,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -340,7 +340,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -413,7 +413,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -497,9 +497,9 @@ class FwdTrans8x8TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -511,8 +511,8 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error - << " at index " << j; + EXPECT_EQ(0u, error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; } } } @@ -670,14 +670,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0, VPX_BITS_8))); -#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8HT, ::testing::Values( @@ -685,7 +683,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT, @@ -728,10 +727,10 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, InvTrans8x8DCT, ::testing::Values( - make_tuple(&idct8x8_10_add_10_c, &idct8x8_10_add_10_sse2, 6225, + make_tuple(&idct8x8_12_add_10_c, 
&idct8x8_12_add_10_sse2, 6225, VPX_BITS_10), make_tuple(&idct8x8_10, &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10), - make_tuple(&idct8x8_10_add_12_c, &idct8x8_10_add_12_sse2, 6225, + make_tuple(&idct8x8_12_add_12_c, &idct8x8_12_add_12_sse2, 6225, VPX_BITS_12), make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -740,7 +739,7 @@ INSTANTIATE_TEST_CASE_P( !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, - &vpx_idct8x8_64_add_ssse3, + &vpx_idct8x8_64_add_sse2, 0, VPX_BITS_8))); #endif @@ -757,4 +756,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/hadamard_test.cc b/libs/libvpx/test/hadamard_test.cc index e7715958ed..3b7cfeddcf 100644 --- a/libs/libvpx/test/hadamard_test.cc +++ b/libs/libvpx/test/hadamard_test.cc @@ -13,6 +13,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_timer.h" #include "test/acm_random.h" #include "test/register_state_check.h" @@ -21,7 +22,8 @@ namespace { using ::libvpx_test::ACMRandom; -typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b); +typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride, + tran_low_t *b); void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { int16_t b[8]; @@ -46,18 +48,16 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { out[5] = c[3] - c[7]; } -void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) { +void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) { int16_t buf[64]; - for (int i = 0; i < 8; ++i) { - hadamard_loop(a + i, a_stride, buf + i * 8); - } + int16_t buf2[64]; + for (int i = 0; i < 8; ++i) hadamard_loop(a + i, a_stride, buf + i * 8); + for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, 8, buf2 + i * 8); - for (int i = 0; i < 8; ++i) { - hadamard_loop(buf + i, 8, b + i * 8); - } + for (int i = 0; i < 64; ++i) b[i] = (tran_low_t)buf2[i]; } -void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { +void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) { /* The source is a 16x16 block. The destination is rearranged to 8x32. * Input is 9 bit. */ reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0); @@ -68,16 +68,16 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { /* Overlay the 8x8 blocks and combine. */ for (int i = 0; i < 64; ++i) { /* 8x8 steps the range up to 15 bits. */ - const int16_t a0 = b[0]; - const int16_t a1 = b[64]; - const int16_t a2 = b[128]; - const int16_t a3 = b[192]; + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[64]; + const tran_low_t a2 = b[128]; + const tran_low_t a3 = b[192]; /* Prevent the result from escaping int16_t. 
*/ - const int16_t b0 = (a0 + a1) >> 1; - const int16_t b1 = (a0 - a1) >> 1; - const int16_t b2 = (a2 + a3) >> 1; - const int16_t b3 = (a2 - a3) >> 1; + const tran_low_t b0 = (a0 + a1) >> 1; + const tran_low_t b1 = (a0 - a1) >> 1; + const tran_low_t b2 = (a2 + a3) >> 1; + const tran_low_t b3 = (a2 - a3) >> 1; /* Store a 16 bit value. */ b[0] = b0 + b2; @@ -101,12 +101,35 @@ class HadamardTestBase : public ::testing::TestWithParam { ACMRandom rnd_; }; +void HadamardSpeedTest(const char *name, HadamardFunc const func, + const int16_t *input, int stride, tran_low_t *output, + int times) { + int i; + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (i = 0; i < times; ++i) { + func(input, stride, output); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("%s[%12d runs]: %d us\n", name, times, elapsed_time); +} + class Hadamard8x8Test : public HadamardTestBase {}; +void HadamardSpeedTest8x8(HadamardFunc const func, int times) { + DECLARE_ALIGNED(16, int16_t, input[64]); + DECLARE_ALIGNED(16, tran_low_t, output[64]); + memset(input, 1, sizeof(input)); + HadamardSpeedTest("Hadamard8x8", func, input, 8, output, times); +} + TEST_P(Hadamard8x8Test, CompareReferenceRandom) { DECLARE_ALIGNED(16, int16_t, a[64]); - DECLARE_ALIGNED(16, int16_t, b[64]); - int16_t b_ref[64]; + DECLARE_ALIGNED(16, tran_low_t, b[64]); + tran_low_t b_ref[64]; for (int i = 0; i < 64; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -124,8 +147,8 @@ TEST_P(Hadamard8x8Test, CompareReferenceRandom) { TEST_P(Hadamard8x8Test, VaryStride) { DECLARE_ALIGNED(16, int16_t, a[64 * 8]); - DECLARE_ALIGNED(16, int16_t, b[64]); - int16_t b_ref[64]; + DECLARE_ALIGNED(16, tran_low_t, b[64]); + tran_low_t b_ref[64]; for (int i = 0; i < 64 * 8; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -144,6 +167,12 @@ TEST_P(Hadamard8x8Test, VaryStride) { } } +TEST_P(Hadamard8x8Test, DISABLED_Speed) { + HadamardSpeedTest8x8(h_func_, 10); + HadamardSpeedTest8x8(h_func_, 10000); + HadamardSpeedTest8x8(h_func_, 10000000); +} + INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_c)); @@ -162,12 +191,33 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_neon)); #endif // HAVE_NEON +// TODO(jingning): Remove highbitdepth flag when the SIMD functions are +// in place and turn on the unit test. 
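// Worked range check (illustrative, not part of the patch): the pre-scaling
// by >> 1 in reference_hadamard16x16() above is what keeps the combined
// transform inside the 16-bit range even though the output type is now
// tran_low_t. Under the 9-bit input assumption stated in the source comments:
//
//   |x|                  <= 2^8   (9-bit signed input)
//   |a0|,|a1|,|a2|,|a3|  <= 64 * 2^8 = 2^14   (one 8x8 Hadamard, "15 bits")
//   |(a0 +/- a1) >> 1|   <= 2^14   (halving restores the 15-bit headroom)
//   |b0 +/- b2|          <= 2^15   (the "16 bit value" the comment promises)
//
// Without the shift the straight combination a0 + a1 + a2 + a3 could reach
// 2^16 and overflow an int16_t intermediate.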
+#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test, + ::testing::Values(&vpx_hadamard_8x8_msa)); +#endif // HAVE_MSA +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test, + ::testing::Values(&vpx_hadamard_8x8_vsx)); +#endif // HAVE_VSX + class Hadamard16x16Test : public HadamardTestBase {}; +void HadamardSpeedTest16x16(HadamardFunc const func, int times) { + DECLARE_ALIGNED(16, int16_t, input[256]); + DECLARE_ALIGNED(16, tran_low_t, output[256]); + memset(input, 1, sizeof(input)); + HadamardSpeedTest("Hadamard16x16", func, input, 16, output, times); +} + TEST_P(Hadamard16x16Test, CompareReferenceRandom) { DECLARE_ALIGNED(16, int16_t, a[16 * 16]); - DECLARE_ALIGNED(16, int16_t, b[16 * 16]); - int16_t b_ref[16 * 16]; + DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]); + tran_low_t b_ref[16 * 16]; for (int i = 0; i < 16 * 16; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -185,8 +235,8 @@ TEST_P(Hadamard16x16Test, CompareReferenceRandom) { TEST_P(Hadamard16x16Test, VaryStride) { DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]); - DECLARE_ALIGNED(16, int16_t, b[16 * 16]); - int16_t b_ref[16 * 16]; + DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]); + tran_low_t b_ref[16 * 16]; for (int i = 0; i < 16 * 16 * 8; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -205,6 +255,12 @@ TEST_P(Hadamard16x16Test, VaryStride) { } } +TEST_P(Hadamard16x16Test, DISABLED_Speed) { + HadamardSpeedTest16x16(h_func_, 10); + HadamardSpeedTest16x16(h_func_, 10000); + HadamardSpeedTest16x16(h_func_, 10000000); +} + INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_c)); @@ -213,8 +269,25 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_avx2)); +#endif // HAVE_AVX2 + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_vsx)); +#endif // HAVE_VSX + #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_neon)); #endif // HAVE_NEON + +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_msa)); +#endif // HAVE_MSA +#endif // !CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/libs/libvpx/test/idct_test.cc b/libs/libvpx/test/idct_test.cc index f54f2c005a..3700374d7a 100644 --- a/libs/libvpx/test/idct_test.cc +++ b/libs/libvpx/test/idct_test.cc @@ -13,6 +13,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "vpx/vpx_integer.h" @@ -21,106 +22,156 @@ typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); namespace { + +using libvpx_test::Buffer; + class IDCTTest : public ::testing::TestWithParam { protected: virtual void SetUp() { - int i; - UUT = GetParam(); - memset(input, 0, sizeof(input)); - /* Set up guard blocks */ - for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 
0 : -1; + + input = new Buffer(4, 4, 0); + ASSERT_TRUE(input != NULL); + ASSERT_TRUE(input->Init()); + predict = new Buffer(4, 4, 3); + ASSERT_TRUE(predict != NULL); + ASSERT_TRUE(predict->Init()); + output = new Buffer(4, 4, 3); + ASSERT_TRUE(output != NULL); + ASSERT_TRUE(output->Init()); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + virtual void TearDown() { + delete input; + delete predict; + delete output; + libvpx_test::ClearSystemState(); + } IdctFunc UUT; - int16_t input[16]; - unsigned char output[256]; - unsigned char predict[256]; + Buffer *input; + Buffer *predict; + Buffer *output; }; -TEST_P(IDCTTest, TestGuardBlocks) { - int i; - - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << i; - else - EXPECT_EQ(255, output[i]); - } -} - TEST_P(IDCTTest, TestAllZeros) { - int i; + // When the input is '0' the output will be '0'. + input->Set(0); + predict->Set(0); + output->Set(0); - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; - } + ASSERT_TRUE(input->CheckValues(0)); + ASSERT_TRUE(input->CheckPadding()); + ASSERT_TRUE(output->CheckValues(0)); + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestAllOnes) { - int i; + input->Set(0); + // When the first element is '4' it will fill the output buffer with '1'. + input->TopLeftPixel()[0] = 4; + predict->Set(0); + output->Set(0); - input[0] = 4; - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; - } + ASSERT_TRUE(output->CheckValues(1)); + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestAddOne) { - int i; + // Set the transform output to '1' and make sure it gets added to the + // prediction buffer. + input->Set(0); + input->TopLeftPixel()[0] = 4; + output->Set(0); - for (i = 0; i < 256; i++) predict[i] = i; - input[0] = 4; - ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); - - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(i + 1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + uint8_t *pred = predict->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + pred[y * predict->stride() + x] = y * 4 + x; + } } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t const *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + EXPECT_EQ(1 + y * 4 + x, out[y * output->stride() + x]); + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestWithData) { - int i; + // Test a single known input. 
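// Illustrative sketch (not part of the patch): the Buffer<> helper from
// test/buffer.h used in the rewritten IDCT tests above wraps a width x height
// block in a guard border, so out-of-bounds writes are caught by
// CheckPadding() instead of relying on the hand-initialised 256-byte arrays
// of the old code. Typical use, with some_idct standing in for whichever
// function is under test and pred/coeffs assumed to be set up the same way:
//
//   libvpx_test::Buffer<uint8_t> dst(4, 4, 3);  // 4x4 block, 3-pixel border
//   ASSERT_TRUE(dst.Init());                    // allocate and place guards
//   dst.Set(0);                                 // clear the visible 4x4 area
//   some_idct(coeffs, pred.TopLeftPixel(), pred.stride(),
//             dst.TopLeftPixel(), dst.stride());
//   EXPECT_TRUE(dst.CheckValues(0));            // visible area as expected
//   EXPECT_TRUE(dst.CheckPadding());            // guard border untouched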
+ predict->Set(0); - for (i = 0; i < 16; i++) input[i] = i; - - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - - for (i = 0; i < 256; i++) { - if ((i & 0xF) > 3 || i > 63) - EXPECT_EQ(255, output[i]) << "i==" << i; - else if (i == 0) - EXPECT_EQ(11, output[i]) << "i==" << i; - else if (i == 34) - EXPECT_EQ(1, output[i]) << "i==" << i; - else if (i == 2 || i == 17 || i == 32) - EXPECT_EQ(3, output[i]) << "i==" << i; - else - EXPECT_EQ(0, output[i]) << "i==" << i; + int16_t *in = input->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + in[y * input->stride() + x] = y * 4 + x; + } } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + switch (y * 4 + x) { + case 0: EXPECT_EQ(11, out[y * output->stride() + x]); break; + case 2: + case 5: + case 8: EXPECT_EQ(3, out[y * output->stride() + x]); break; + case 10: EXPECT_EQ(1, out[y * output->stride() + x]); break; + default: EXPECT_EQ(0, out[y * output->stride() + x]); + } + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); } INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); +#endif // HAVE_NEON + #if HAVE_MMX INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmx)); -#endif +#endif // HAVE_MMX + #if HAVE_MSA INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_msa)); -#endif +#endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); +#endif // HAVE_MMI } diff --git a/libs/libvpx/test/invalid_file_test.cc b/libs/libvpx/test/invalid_file_test.cc index bebbb141d0..79220b0f69 100644 --- a/libs/libvpx/test/invalid_file_test.cc +++ b/libs/libvpx/test/invalid_file_test.cc @@ -45,8 +45,8 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, void OpenResFile(const std::string &res_file_name_) { res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); - ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: " - << res_file_name_; + ASSERT_TRUE(res_file_ != NULL) + << "Result file open failed. Filename: " << res_file_name_; } virtual bool HandleDecodeResult( @@ -120,10 +120,23 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidFileTests[] = { + { 1, "invalid-bug-1443.ivf" }, +}; + +VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); +#endif // CONFIG_VP8_DECODER + +#if CONFIG_VP9_DECODER const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-02-v2.webm" }, #if CONFIG_VP9_HIGHBITDEPTH { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" }, + { 1, + "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-." 
+ "ivf" }, #endif { 1, "invalid-vp90-03-v3.webm" }, { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" }, @@ -141,10 +154,14 @@ const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf" }, { 1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf" }, { 1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf" }, + { 1, + "invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf" }, + { 1, "invalid-crbug-667044.webm" }, }; VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, ::testing::ValuesIn(kVP9InvalidFileTests)); +#endif // CONFIG_VP9_DECODER // This class will include test vectors that are expected to fail // peek. However they are still expected to have no fatal failures. @@ -159,12 +176,12 @@ class InvalidFileInvalidPeekTest : public InvalidFileTest { TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER -const DecodeParam kVP8InvalidFileTests[] = { +const DecodeParam kVP8InvalidPeekTests[] = { { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP8InvalidFileTests)); + ::testing::ValuesIn(kVP8InvalidPeekTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER @@ -184,6 +201,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" }, { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" }, { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" }, + { 2, "invalid-crbug-629481.webm" }, }; INSTANTIATE_TEST_CASE_P( diff --git a/libs/libvpx/test/ivf_video_source.h b/libs/libvpx/test/ivf_video_source.h index b87624a11f..5862d2649f 100644 --- a/libs/libvpx/test/ivf_video_source.h +++ b/libs/libvpx/test/ivf_video_source.h @@ -47,8 +47,8 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. 
Filename: " << file_name_; // Read file header uint8_t file_hdr[kIvfFileHdrSize]; diff --git a/libs/libvpx/test/keyframe_test.cc b/libs/libvpx/test/keyframe_test.cc index 38bd923b7d..ee75f401ca 100644 --- a/libs/libvpx/test/keyframe_test.cc +++ b/libs/libvpx/test/keyframe_test.cc @@ -135,8 +135,8 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { for (std::vector::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { if (deadline_ == VPX_DL_REALTIME && *iter > 0) - EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " - << *iter; + EXPECT_EQ(0, (*iter - 1) % 30) + << "Unexpected keyframe at frame " << *iter; else EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter; } diff --git a/libs/libvpx/test/level_test.cc b/libs/libvpx/test/level_test.cc index fbbb539df1..26935a81b0 100644 --- a/libs/libvpx/test/level_test.cc +++ b/libs/libvpx/test/level_test.cc @@ -66,6 +66,36 @@ class LevelTest int level_; }; +TEST_P(LevelTest, TestTargetLevel11Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 60); + target_level_ = 11; + cfg_.rc_target_bitrate = 150; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +TEST_P(LevelTest, TestTargetLevel20Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 60); + target_level_ = 20; + cfg_.rc_target_bitrate = 1200; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +TEST_P(LevelTest, TestTargetLevel31Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, + 1, 0, 60); + target_level_ = 31; + cfg_.rc_target_bitrate = 8000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + // Test for keeping level stats only TEST_P(LevelTest, TestTargetLevel0) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, @@ -73,11 +103,11 @@ TEST_P(LevelTest, TestTargetLevel0) { target_level_ = 0; min_gf_internal_ = 4; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(11, level_); + ASSERT_GE(11, level_); cfg_.rc_target_bitrate = 1600; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(20, level_); + ASSERT_GE(20, level_); } // Test for level control being turned off @@ -94,12 +124,13 @@ TEST_P(LevelTest, TestTargetLevelApi) { vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); + cfg.rc_target_bitrate = 100; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0)); for (int level = 0; level <= 256; ++level) { if (level == 10 || level == 11 || level == 20 || level == 21 || level == 30 || level == 31 || level == 40 || level == 41 || level == 50 || level == 51 || level == 52 || level == 60 || - level == 61 || level == 62 || level == 0 || level == 255) + level == 61 || level == 62 || level == 0 || level == 1 || level == 255) EXPECT_EQ(VPX_CODEC_OK, vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); else diff --git a/libs/libvpx/test/lpf_test.cc b/libs/libvpx/test/lpf_test.cc index 552b5e33c2..e04b996cd8 100644 --- a/libs/libvpx/test/lpf_test.cc +++ b/libs/libvpx/test/lpf_test.cc @@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, } } +uint8_t GetOuterThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(3 * 
MAX_LOOP_FILTER + 5)); +} + +uint8_t GetInnerThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1)); +} + +uint8_t GetHevThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4); +} + class Loop8Test6Param : public ::testing::TestWithParam { public: virtual ~Loop8Test6Param() {} @@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) { for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, 
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -402,10 +414,10 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_4_c, 8), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, &vpx_highbd_lpf_horizontal_8_c, 8), - make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, - &vpx_highbd_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, - &vpx_highbd_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, &vpx_highbd_lpf_vertical_8_c, 8), make_tuple(&vpx_highbd_lpf_vertical_16_sse2, @@ -416,10 +428,10 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_4_c, 10), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, &vpx_highbd_lpf_horizontal_8_c, 10), - make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, - &vpx_highbd_lpf_horizontal_edge_8_c, 10), - make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, - &vpx_highbd_lpf_horizontal_edge_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 10), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, &vpx_highbd_lpf_vertical_8_c, 10), make_tuple(&vpx_highbd_lpf_vertical_16_sse2, @@ -430,10 +442,10 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_4_c, 12), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, &vpx_highbd_lpf_horizontal_8_c, 12), - make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, - &vpx_highbd_lpf_horizontal_edge_8_c, 12), - make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, - &vpx_highbd_lpf_horizontal_edge_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 12), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, &vpx_highbd_lpf_vertical_8_c, 12), make_tuple(&vpx_highbd_lpf_vertical_16_sse2, @@ -450,10 +462,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_sse2, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_8_sse2, - &vpx_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_16_sse2, - &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_sse2, + 
&vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_4_sse2, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_sse2, &vpx_lpf_vertical_16_c, 8), @@ -465,10 +476,10 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH) INSTANTIATE_TEST_CASE_P( AVX2, Loop8Test6Param, - ::testing::Values(make_tuple(&vpx_lpf_horizontal_edge_8_avx2, - &vpx_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_16_avx2, - &vpx_lpf_horizontal_edge_16_c, 8))); + ::testing::Values(make_tuple(&vpx_lpf_horizontal_16_avx2, + &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_avx2, + &vpx_lpf_horizontal_16_dual_c, 8))); #endif #if HAVE_SSE2 @@ -515,15 +526,89 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH -// No neon high bitdepth functions. +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test6Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 12))); +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 8), + 
make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 12))); #else INSTANTIATE_TEST_CASE_P( NEON, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_edge_8_neon, - &vpx_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_16_neon, - &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_neon, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_neon, + &vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_16_neon, &vpx_lpf_vertical_16_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_neon, &vpx_lpf_vertical_16_dual_c, 8), @@ -550,8 +635,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_dspr2, &vpx_lpf_horizontal_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_8, &vpx_lpf_horizontal_edge_8, 8), - make_tuple(&vpx_lpf_horizontal_edge_16, &vpx_lpf_horizontal_edge_16, 8), + make_tuple(&vpx_lpf_horizontal_16_dspr2, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_dspr2, + &vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_4_dspr2, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_dspr2, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_dspr2, &vpx_lpf_vertical_16_c, 8), @@ -576,10 +662,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_8_msa, &vpx_lpf_horizontal_edge_8_c, - 8), - make_tuple(&vpx_lpf_horizontal_edge_16_msa, - &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_msa, + &vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_4_msa, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_msa, &vpx_lpf_vertical_16_c, 8))); diff --git a/libs/libvpx/test/minmax_test.cc b/libs/libvpx/test/minmax_test.cc index e51c9fd48c..9c119116a8 100644 --- a/libs/libvpx/test/minmax_test.cc +++ b/libs/libvpx/test/minmax_test.cc @@ -107,10 +107,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { int min_ref, max_ref, min, max; reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); - EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; - EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; } } } @@ -127,4 +127,9 @@ 
INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); #endif +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_msa)); +#endif + } // namespace diff --git a/libs/libvpx/test/partial_idct_test.cc b/libs/libvpx/test/partial_idct_test.cc index de6ff4a01b..f7b50f53a1 100644 --- a/libs/libvpx/test/partial_idct_test.cc +++ b/libs/libvpx/test/partial_idct_test.cc @@ -12,6 +12,8 @@ #include #include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vp9_rtcd.h" @@ -23,235 +25,935 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_scan.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { + typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple +typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out, + int stride, int bd); + +template +void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { + (void)bd; + fn(in, out, stride); +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out, + int stride, int bd); +template +void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} +#endif + +typedef std::tr1::tuple PartialInvTxfmParam; const int kMaxNumCoeffs = 1024; +const int kCountTestBlock = 1000; + class PartialIDctTest : public ::testing::TestWithParam { public: virtual ~PartialIDctTest() {} virtual void SetUp() { - ftxfm_ = GET_PARAM(0); - full_itxfm_ = GET_PARAM(1); - partial_itxfm_ = GET_PARAM(2); + rnd_.Reset(ACMRandom::DeterministicSeed()); + fwd_txfm_ = GET_PARAM(0); + full_inv_txfm_ = GET_PARAM(1); + partial_inv_txfm_ = GET_PARAM(2); tx_size_ = GET_PARAM(3); last_nonzero_ = GET_PARAM(4); + bit_depth_ = GET_PARAM(5); + pixel_size_ = GET_PARAM(6); + mask_ = (1 << bit_depth_) - 1; + + switch (tx_size_) { + case TX_4X4: size_ = 4; break; + case TX_8X8: size_ = 8; break; + case TX_16X16: size_ = 16; break; + case TX_32X32: size_ = 32; break; + default: FAIL() << "Wrong Size!"; break; + } + + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. 
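// Illustrative worked example (editorial sketch, assuming size_ == 16):
// rnd_(1024) + 1 draws a stride in [1, 1024]; a draw of 1000 gives
// stride_ = 1001, which the masking below (stride_ &= ~15) rounds down to
// 992, while a draw of 7 is first clamped up to size_. The maximum draw,
// 1024, is already a multiple of 16.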
+ if (stride_ > 16) { + stride_ &= ~15; + } + + input_block_size_ = size_ * size_; + output_block_size_ = size_ * stride_; + + input_block_ = reinterpret_cast( + vpx_memalign(16, sizeof(*input_block_) * input_block_size_)); + output_block_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * output_block_size_)); + output_block_ref_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * output_block_size_)); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + virtual void TearDown() { + vpx_free(input_block_); + input_block_ = NULL; + vpx_free(output_block_); + output_block_ = NULL; + vpx_free(output_block_ref_); + output_block_ref_ = NULL; + libvpx_test::ClearSystemState(); + } + + void InitMem() { + memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); + if (pixel_size_ == 1) { + for (int j = 0; j < output_block_size_; ++j) { + output_block_[j] = output_block_ref_[j] = rnd_.Rand16() & mask_; + } + } else { + ASSERT_EQ(2, pixel_size_); + uint16_t *const output = reinterpret_cast(output_block_); + uint16_t *const output_ref = + reinterpret_cast(output_block_ref_); + for (int j = 0; j < output_block_size_; ++j) { + output[j] = output_ref[j] = rnd_.Rand16() & mask_; + } + } + } + + void InitInput() { + const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4; + int64_t max_energy_leftover = max_coeff * max_coeff; + for (int j = 0; j < last_nonzero_; ++j) { + tran_low_t coeff = static_cast( + sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536); + max_energy_leftover -= static_cast(coeff) * coeff; + if (max_energy_leftover < 0) { + max_energy_leftover = 0; + coeff = 0; + } + input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coeff; + } + } + + void PrintDiff() { + if (memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) { + uint16_t ref, opt; + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + if (pixel_size_ == 1) { + ref = output_block_ref_[y * stride_ + x]; + opt = output_block_[y * stride_ + x]; + } else { + ref = reinterpret_cast( + output_block_ref_)[y * stride_ + x]; + opt = reinterpret_cast(output_block_)[y * stride_ + x]; + } + if (ref != opt) { + printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt); + } + } + } + + printf("\ninput_block_:\n"); + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + printf("%6d,", input_block_[y * size_ + x]); + } + printf("\n"); + } + } + } protected: int last_nonzero_; TX_SIZE tx_size_; - FwdTxfmFunc ftxfm_; - InvTxfmFunc full_itxfm_; - InvTxfmFunc partial_itxfm_; + tran_low_t *input_block_; + uint8_t *output_block_; + uint8_t *output_block_ref_; + int size_; + int stride_; + int pixel_size_; + int input_block_size_; + int output_block_size_; + int bit_depth_; + int mask_; + FwdTxfmFunc fwd_txfm_; + InvTxfmWithBdFunc full_inv_txfm_; + InvTxfmWithBdFunc partial_inv_txfm_; + ACMRandom rnd_; }; TEST_P(PartialIDctTest, RunQuantCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - int size; - switch (tx_size_) { - case TX_4X4: size = 4; break; - case TX_8X8: size = 8; break; - case TX_16X16: size = 16; break; - case TX_32X32: size = 32; break; - default: FAIL() << "Wrong Size!"; break; - } - DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]); - - const int count_test_block = 1000; - const int block_size = size * size; - + const int 
count_test_block = (size_ != 4) ? kCountTestBlock : 65536; DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]); DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]); - int max_error = 0; + InitMem(); + for (int i = 0; i < count_test_block; ++i) { - // clear out destination buffer - memset(dst1, 0, sizeof(*dst1) * block_size); - memset(dst2, 0, sizeof(*dst2) * block_size); - memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size); - memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size); - - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-255, 255]. + // Initialize a test block with input range [-mask_, mask_]. + if (size_ != 4) { if (i == 0) { - for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255; + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = mask_; + } } else if (i == 1) { - for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255; + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = -mask_; + } } else { - for (int j = 0; j < block_size; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_; } } - - ftxfm_(input_extreme_block, output_ref_block, size); - - // quantization with maximum allowed step sizes - test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336; - for (int j = 1; j < last_nonzero_; ++j) { - test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = - (output_ref_block[j] / 1828) * 1828; + } else { + // Try all possible combinations. + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = (i & (1 << k)) ? 
mask_ : -mask_; } } - ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size)); - ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size)); + fwd_txfm_(input_extreme_block, output_ref_block, size_); - for (int j = 0; j < block_size; ++j) { - const int diff = dst1[j] - dst2[j]; - const int error = diff * diff; - if (max_error < error) max_error = error; + // quantization with minimum allowed step sizes + input_block_[0] = (output_ref_block[0] / 4) * 4; + for (int k = 1; k < last_nonzero_; ++k) { + const int pos = vp9_default_scan_orders[tx_size_].scan[k]; + input_block_[pos] = (output_ref_block[pos] / 4) * 4; } - } - EXPECT_EQ(0, max_error) - << "Error: partial inverse transform produces different results"; + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; + } } TEST_P(PartialIDctTest, ResultsMatch) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - int size; - switch (tx_size_) { - case TX_4X4: size = 4; break; - case TX_8X8: size = 8; break; - case TX_16X16: size = 16; break; - case TX_32X32: size = 32; break; - default: FAIL() << "Wrong Size!"; break; + for (int i = 0; i < kCountTestBlock; ++i) { + InitMem(); + InitInput(); + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; } - DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]); - const int count_test_block = 1000; - const int max_coeff = 32766 / 4; - const int block_size = size * size; - int max_error = 0; - for (int i = 0; i < count_test_block; ++i) { - // clear out destination buffer - memset(dst1, 0, sizeof(*dst1) * block_size); - memset(dst2, 0, sizeof(*dst2) * block_size); - memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size); - memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size); - int max_energy_leftover = max_coeff * max_coeff; +} + +TEST_P(PartialIDctTest, AddOutputBlock) { + for (int i = 0; i < kCountTestBlock; ++i) { + InitMem(); for (int j = 0; j < last_nonzero_; ++j) { - int16_t coef = static_cast(sqrt(1.0 * max_energy_leftover) * - (rnd.Rand16() - 32768) / 65536); - max_energy_leftover -= coef * coef; - if (max_energy_leftover < 0) { - max_energy_leftover = 0; - coef = 0; - } - test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = coef; + input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = 10; } - memcpy(test_coef_block2, test_coef_block1, - sizeof(*test_coef_block2) * block_size); + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: Transform results are not correctly added to output."; + } +} - 
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size)); - ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size)); +TEST_P(PartialIDctTest, SingleExtremeCoeff) { + const int16_t max_coeff = std::numeric_limits::max(); + const int16_t min_coeff = std::numeric_limits::min(); + for (int i = 0; i < last_nonzero_; ++i) { + memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); + // Run once for min and once for max. + for (int j = 0; j < 2; ++j) { + const int coeff = j ? min_coeff : max_coeff; - for (int j = 0; j < block_size; ++j) { - const int diff = dst1[j] - dst2[j]; - const int error = diff * diff; - if (max_error < error) max_error = error; + memset(output_block_, 0, pixel_size_ * output_block_size_); + memset(output_block_ref_, 0, pixel_size_ * output_block_size_); + input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff; + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: Fails with single coeff of " << coeff << " at " << i + << "."; } } +} - EXPECT_EQ(0, max_error) +TEST_P(PartialIDctTest, DISABLED_Speed) { + // Keep runtime stable with transform size. + const int kCountSpeedTestBlock = 500000000 / input_block_size_; + InitMem(); + InitInput(); + + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + } + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_, + (pixel_size_ == 1) ? 
"bitdepth" : "high bitdepth", bit_depth_, + elapsed_time); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; } + using std::tr1::make_tuple; -INSTANTIATE_TEST_CASE_P( - C, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_34_add_c, TX_32X32, 34), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_c, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_c, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_c, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_c, TX_8X8, 12), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_c, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_c, TX_4X4, 1))); +const PartialInvTxfmParam c_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + 
make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - NEON, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_neon, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_neon, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_neon, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_neon, TX_8X8, 12), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_neon, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_neon, TX_4X4, 1))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, + ::testing::ValuesIn(c_partial_idct_tests)); -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_34_add_sse2, TX_32X32, 34), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_sse2, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, 
&vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_sse2, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_sse2, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_sse2, TX_8X8, 12), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_sse2, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_sse2, TX_4X4, 1))); -#endif +#if !CONFIG_EMULATE_HARDWARE -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ - !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSSE3_64, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_ssse3, TX_8X8, 12))); -#endif +#if HAVE_NEON +const PartialInvTxfmParam neon_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + 
&highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - MSA, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_34_add_msa, TX_32X32, 34), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_msa, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_msa, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_msa, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_msa, TX_8X8, 10), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_msa, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_msa, TX_4X4, 1))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, + ::testing::ValuesIn(neon_partial_idct_tests)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +// 32x32_135_ is implemented using the 1024 version. 
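Each entry in these parameter tables binds a reference forward transform to a full and a partial inverse transform through the wrapper template, followed by the transform size, the number of nonzero coefficients, the bit depth, and the pixel size in bytes. A minimal sketch of one 8-bit entry, in the spirit of the SSE2 table that follows (the particular functions and template argument are illustrative, not an addition to the table):

// Sketch: one 8-bit PartialInvTxfmParam entry. wrapper<fn> adapts the
// (in, out, stride) inverse-transform signature to the harness's
// (in, out, stride, bd) signature and ignores bd.
const PartialInvTxfmParam example = make_tuple(
    &vpx_fdct8x8_c,                     // forward transform used as reference
    &wrapper<vpx_idct8x8_64_add_c>,     // full inverse transform
    &wrapper<vpx_idct8x8_12_add_sse2>,  // partial inverse transform under test
    TX_8X8, 12,                         // transform size, last nonzero coeff
    8, 1);                              // bit depth, bytes per pixel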
+const PartialInvTxfmParam sse2_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + 
&highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, + ::testing::ValuesIn(sse2_partial_idct_tests)); + +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +const PartialInvTxfmParam ssse3_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, + ::testing::ValuesIn(ssse3_partial_idct_tests)); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + 
&vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2) +}; + +INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam dspr2_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, + ::testing::ValuesIn(dspr2_partial_idct_tests)); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +// 32x32_135_ is implemented using the 1024 version. 
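Entries in the high-bit-depth lists above go through highbd_wrapper instead, which reinterprets the 8-bit destination pointer as 16-bit samples and forwards the bit depth. A hedged sketch of the shape of one such entry (the particular functions and parameters are illustrative only):

// Sketch: a high-bit-depth PartialInvTxfmParam entry. highbd_wrapper<fn>
// calls fn(in, CAST_TO_SHORTPTR(out), stride, bd), matching pixel_size_ == 2.
const PartialInvTxfmParam highbd_example = make_tuple(
    &vpx_highbd_fdct4x4_c,
    &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,  // full inverse
    &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,   // partial inverse (DC only)
    TX_4X4, 1,                                     // transform size, last nonzero coeff
    10, 2);                                        // 10-bit, 2 bytes per pixel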
+const PartialInvTxfmParam msa_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest, + ::testing::ValuesIn(msa_partial_idct_tests)); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/pp_filter_test.cc b/libs/libvpx/test/pp_filter_test.cc index 2e34fed065..5a2ade1ef4 100644 --- a/libs/libvpx/test/pp_filter_test.cc +++ b/libs/libvpx/test/pp_filter_test.cc @@ -7,22 +7,42 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr, - int src_pixels_per_line, int dst_pixels_per_line, - int cols, unsigned char *flimit, int size); +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; + +typedef void (*VpxPostProcDownAndAcrossMbRowFunc)( + unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, + int dst_pixels_per_line, int cols, unsigned char *flimit, int size); + +typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch, + int rows, int cols, int flimit); + +typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, + int cols, int flimit); namespace { -class VPxPostProcessingFilterTest - : public ::testing::TestWithParam { +// Compute the filter level used in post proc from the loop filter strength +int q2mbl(int x) { + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +class VpxPostProcDownAndAcrossMbRowTest + : public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } }; @@ -30,31 +50,22 @@ class VPxPostProcessingFilterTest // Test routine for the VPx post-processing function // vpx_post_proc_down_and_across_mb_row_c. -TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) { +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Size of the underlying data block that will be filtered. const int block_width = 16; const int block_height = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. 
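The post-proc tests below stage their pixels through the Buffer helper from test/buffer.h rather than raw vpx_calloc allocations; per-side padding overloads are used where asymmetric borders are needed. A minimal usage sketch of the uniform-padding form (the element type parameter is written out explicitly here):

// Sketch (editorial): staging a 16x16 block with an 8-pixel border all around.
Buffer<uint8_t> block = Buffer<uint8_t>(16, 16, 8);  // width, height, padding
ASSERT_TRUE(block.Init());  // allocates the padded, aligned storage
block.SetPadding(10);       // fill only the border pixels
block.Set(1);               // fill the visible 16x16 interior
uint8_t *top_left = block.TopLeftPixel();  // first pixel inside the padding
// Successive rows are block.stride() bytes apart (interior width plus the
// side padding).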
- const int input_width = block_width; - const int input_height = block_height + 4; - const int input_stride = input_width; - const int input_size = input_width * input_height; + Buffer src_image = Buffer(block_width, block_height, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. - const int output_width = block_width + 16; - const int output_height = block_height; - const int output_stride = output_width; - const int output_size = output_width * output_height; + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer dst_image = + Buffer(block_width, block_height, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); - uint8_t *const src_image = - reinterpret_cast(vpx_calloc(input_size, 1)); - uint8_t *const dst_image = - reinterpret_cast(vpx_calloc(output_size, 1)); - - // Pointers to top-left pixel of block in the input and output images. - uint8_t *const src_image_ptr = src_image + (input_stride << 1); - uint8_t *const dst_image_ptr = dst_image + 8; uint8_t *const flimits = reinterpret_cast(vpx_memalign(16, block_width)); (void)memset(flimits, 255, block_width); @@ -62,53 +73,412 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) { // Initialize pixels in the input: // block pixels to value 1, // border pixels to value 10. - (void)memset(src_image, 10, input_size); - uint8_t *pixel_ptr = src_image_ptr; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - pixel_ptr[j] = 1; - } - pixel_ptr += input_stride; - } + src_image.SetPadding(10); + src_image.Set(1); // Initialize pixels in the output to 99. - (void)memset(dst_image, 99, output_size); + dst_image.Set(99); - ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, - input_stride, output_stride, block_width, - flimits, 16)); + ASM_REGISTER_STATE_CHECK(GetParam()( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), + dst_image.stride(), block_width, flimits, 16)); - static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 3, 4 }; + static const uint8_t kExpectedOutput[block_height] = { + 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4 + }; - pixel_ptr = dst_image_ptr; + uint8_t *pixel_ptr = dst_image.TopLeftPixel(); for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { - EXPECT_EQ(expected_data[i], pixel_ptr[j]) - << "VPxPostProcessingFilterTest failed with invalid filter output"; + ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) + << "at (" << i << ", " << j << ")"; } - pixel_ptr += output_stride; + pixel_ptr += dst_image.stride(); } - vpx_free(src_image); - vpx_free(dst_image); vpx_free(flimits); }; +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { + // Size of the underlying data block that will be filtered. + // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V + // blocks are always a multiple of 8 wide and exactly 8 high. + const int block_width = 136; + const int block_height = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. + Buffer src_image = + Buffer(block_width, block_height, 2, 2, 10, 2); + ASSERT_TRUE(src_image.Init()); + + // Filter extends output block by 8 samples at left and right edges. 
+ // Though the left padding is only 8 bytes, there is 'above' padding as well + // so when the assembly code tries to read 16 bytes before the pointer it is + // not a problem. + // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. + Buffer dst_image = + Buffer(block_width, block_height, 8, 8, 16, 8); + ASSERT_TRUE(dst_image.Init()); + Buffer dst_image_ref = Buffer(block_width, block_height, 8); + ASSERT_TRUE(dst_image_ref.Init()); + + // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock + // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so + // it must be padded out. + const int flimits_width = block_width % 16 ? block_width + 8 : block_width; + uint8_t *const flimits = + reinterpret_cast(vpx_memalign(16, flimits_width)); + + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + // Initialize pixels in the input: + // block pixels to random values. + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(&rnd, &ACMRandom::Rand8); + + for (int blocks = 0; blocks < block_width; blocks += 8) { + (void)memset(flimits, 0, sizeof(*flimits) * flimits_width); + + for (int f = 0; f < 255; f++) { + (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); + + dst_image.Set(0); + dst_image_ref.Set(0); + + vpx_post_proc_down_and_across_mb_row_c( + src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), + src_image.stride(), dst_image_ref.stride(), block_width, flimits, + block_height); + ASM_REGISTER_STATE_CHECK( + GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width, + flimits, block_height)); + + ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); + } + } + + vpx_free(flimits); +} + +class VpxMbPostProcAcrossIpTest + : public ::testing::TestWithParam { + public: + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void SetCols(unsigned char *s, int rows, int cols, int src_width) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + s[c] = c; + } + s += src_width; + } + } + + void RunComparison(const unsigned char *expected_output, unsigned char *src_c, + int rows, int cols, int src_pitch) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + ASSERT_EQ(expected_output[c], src_c[c]) + << "at (" << r << ", " << c << ")"; + } + src_c += src_pitch; + } + } + + void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, + int filter_level, const unsigned char *expected_output) { + ASM_REGISTER_STATE_CHECK( + GetParam()(s, src_width, rows, cols, filter_level)); + RunComparison(expected_output, s, rows, cols, src_width); + } +}; + +TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + Buffer expected_output = Buffer(cols, rows, 0); + ASSERT_TRUE(expected_output.Init()); + SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride()); + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0), + expected_output.TopLeftPixel()); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + static const unsigned char 
kExpectedOutput[cols] = { + 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 + }; + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(70), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + static const unsigned char kExpectedOutput[cols] = { + 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 + }; + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), INT_MAX, + kExpectedOutput); + + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(100), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { + const int rows = 16; + const int cols = 16; + + Buffer c_mem = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(c_mem.Init()); + Buffer asm_mem = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(asm_mem.Init()); + + // When level >= 100, the filter behaves the same as the level = INT_MAX + // When level < 20, it behaves the same as the level = 0 + for (int level = 0; level < 100; level++) { + c_mem.SetPadding(10); + asm_mem.SetPadding(10); + SetCols(c_mem.TopLeftPixel(), rows, cols, c_mem.stride()); + SetCols(asm_mem.TopLeftPixel(), rows, cols, asm_mem.stride()); + + vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows, + cols, q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + asm_mem.TopLeftPixel(), asm_mem.stride(), rows, cols, q2mbl(level))); + + ASSERT_TRUE(asm_mem.CheckValues(c_mem)); + } +} + +class VpxMbPostProcDownTest + : public ::testing::TestWithParam { + public: + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { + for (int r = 0; r < rows; r++) { + memset(src_c, r, cols); + src_c += src_width; + } + } + + void RunComparison(const unsigned char *expected_output, unsigned char *src_c, + int rows, int cols, int src_pitch) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + ASSERT_EQ(expected_output[r * rows + c], src_c[c]) + << "at (" << r << ", " << c << ")"; + } + src_c += src_pitch; + } + } + + void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, + int filter_level, const unsigned char *expected_output) { + ASM_REGISTER_STATE_CHECK( + GetParam()(s, src_width, rows, cols, filter_level)); + RunComparison(expected_output, s, rows, cols, src_width); + } +}; + +TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + static const unsigned char kExpectedOutput[rows * cols] = { + 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, + 4, 4, 3, 4, 4, 3, 3, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 8, 9, 9, 8, 8, 8, 9, + 9, 8, 9, 9, 8, 8, 8, 9, 9, 10, 10, 9, 9, 9, 10, 10, 9, 10, 10, + 9, 9, 9, 10, 10, 10, 11, 10, 10, 10, 11, 10, 
11, 10, 11, 10, 10, 10, 11, + 10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 12, 11, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, + 13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13, + 13, 13, 13, 13, 14, 13, 13, 13, 13 + }; + + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), INT_MAX, + kExpectedOutput); + + src_c.SetPadding(10); + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(100), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + static const unsigned char kExpectedOutput[rows * cols] = { + 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 13, 12, + 13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13, + 13, 13, 13, 13, 14, 13, 13, 13, 13 + }; + + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(70), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + unsigned char *expected_output = new unsigned char[rows * cols]; + ASSERT_TRUE(expected_output != NULL); + SetRows(expected_output, rows, cols, cols); + + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(0), + expected_output); + + delete[] expected_output; +} + +TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { + const int rows = 16; + const int cols = 16; + + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + Buffer src_asm = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_asm.Init()); + + for (int level = 0; level < 100; level++) { + src_c.SetPadding(10); + src_asm.SetPadding(10); + src_c.Set(&rnd, &ACMRandom::Rand8); + src_asm.CopyFrom(src_c); + + vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c)); + + src_c.SetPadding(10); + src_asm.SetPadding(10); + src_c.Set(&rnd, &ACMRandom::Rand8Extremes); + src_asm.CopyFrom(src_c); + + vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); 
+ ASSERT_TRUE(src_asm.CheckValues(src_c)); + } +} + INSTANTIATE_TEST_CASE_P( - C, VPxPostProcessingFilterTest, + C, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); +INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_c)); + +INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_c)); + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( - SSE2, VPxPostProcessingFilterTest, + SSE2, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2)); -#endif + +INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); + +INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon)); + +INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_neon)); + +INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_neon)); +#endif // HAVE_NEON #if HAVE_MSA INSTANTIATE_TEST_CASE_P( - MSA, VPxPostProcessingFilterTest, + MSA, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa)); -#endif + +INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_msa)); + +INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_msa)); +#endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/predict_test.cc b/libs/libvpx/test/predict_test.cc index 07841a1176..9f366ae529 100644 --- a/libs/libvpx/test/predict_test.cc +++ b/libs/libvpx/test/predict_test.cc @@ -292,15 +292,13 @@ INSTANTIATE_TEST_CASE_P( NEON, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), - make_tuple(8, 4, &vp8_sixtap_predict8x4_neon))); + make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), + make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); #endif #if HAVE_MMX INSTANTIATE_TEST_CASE_P( MMX, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx), - make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx), - make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx), - make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); + ::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); #endif #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( @@ -326,6 +324,15 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { @@ -352,9 +359,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_MMX INSTANTIATE_TEST_CASE_P( MMX, BilinearPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx), - make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx), - make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), + ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); #endif #if HAVE_SSE2 diff --git 
a/libs/libvpx/test/quantize_test.cc b/libs/libvpx/test/quantize_test.cc index 69da8994ca..40bb2642e4 100644 --- a/libs/libvpx/test/quantize_test.cc +++ b/libs/libvpx/test/quantize_test.cc @@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/register_state_check.h b/libs/libvpx/test/register_state_check.h index b8ba34328c..a779e5c06a 100644 --- a/libs/libvpx/test/register_state_check.h +++ b/libs/libvpx/test/register_state_check.h @@ -32,7 +32,9 @@ #undef NOMINMAX #define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include @@ -111,8 +113,8 @@ class RegisterStateCheck { int64_t post_store[8]; vpx_push_neon(post_store); for (int i = 0; i < 8; ++i) { - EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8 - << " has been modified"; + EXPECT_EQ(pre_store_[i], post_store[i]) + << "d" << i + 8 << " has been modified"; } } diff --git a/libs/libvpx/test/resize_test.cc b/libs/libvpx/test/resize_test.cc index c9950dd433..e95dc6651a 100644 --- a/libs/libvpx/test/resize_test.cc +++ b/libs/libvpx/test/resize_test.cc @@ -298,10 +298,10 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 0); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; } } @@ -513,10 +513,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 1); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; EXPECT_EQ(static_cast(0), GetMismatchFrames()); } } diff --git a/libs/libvpx/test/sad_test.cc b/libs/libvpx/test/sad_test.cc index 837b08fbdf..67c3c53150 100644 --- a/libs/libvpx/test/sad_test.cc +++ b/libs/libvpx/test/sad_test.cc @@ -644,19 +644,50 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +const SadMxNAvgParam avg_neon_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), + SadMxNAvgParam(64, 32, 
&vpx_sad64x32_avg_neon), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +}; +INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON @@ -865,6 +896,14 @@ const SadMxNx4Param x4d_avx2_tests[] = { INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); #endif // HAVE_AVX2 +#if HAVE_AVX512 +const SadMxNx4Param x4d_avx512_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), +}; +INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); +#endif // HAVE_AVX512 + //------------------------------------------------------------------------------ // MIPS functions #if HAVE_MSA @@ -920,4 +959,98 @@ const SadMxNx4Param x4d_msa_tests[] = { INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); #endif // HAVE_MSA +//------------------------------------------------------------------------------ +// VSX functions +#if HAVE_VSX +const SadMxNParam vsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_vsx), + SadMxNParam(64, 32, &vpx_sad64x32_vsx), + SadMxNParam(32, 64, &vpx_sad32x64_vsx), + SadMxNParam(32, 32, &vpx_sad32x32_vsx), + SadMxNParam(32, 16, &vpx_sad32x16_vsx), + SadMxNParam(16, 32, &vpx_sad16x32_vsx), + SadMxNParam(16, 16, &vpx_sad16x16_vsx), + SadMxNParam(16, 8, &vpx_sad16x8_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); + +const SadMxNAvgParam avg_vsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); + +const SadMxNx4Param x4d_vsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), +}; 
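
The SadMxNParam/SadMxNAvgParam/SadMxNx4Param entries above and below bind the optimized kernels to one parameterized test. For orientation only, here is a minimal sketch in plain C (hypothetical name, not libvpx's implementation) of the reduction under test: a sum of absolute differences over an MxN block. The *_avg_* variants first average the reference block with a second predictor, and the *x4d* variants compute four such SADs against four reference blocks in one call.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative reference only: the vpx_sadMxN_{neon,vsx,mmi,...} kernels
 * instantiated in this file compute the same reduction with SIMD. */
static unsigned int block_sad(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
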
+INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); +#endif // HAVE_VSX + +//------------------------------------------------------------------------------ +// Loongson functions +#if HAVE_MMI +const SadMxNParam mmi_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_mmi), + SadMxNParam(64, 32, &vpx_sad64x32_mmi), + SadMxNParam(32, 64, &vpx_sad32x64_mmi), + SadMxNParam(32, 32, &vpx_sad32x32_mmi), + SadMxNParam(32, 16, &vpx_sad32x16_mmi), + SadMxNParam(16, 32, &vpx_sad16x32_mmi), + SadMxNParam(16, 16, &vpx_sad16x16_mmi), + SadMxNParam(16, 8, &vpx_sad16x8_mmi), + SadMxNParam(8, 16, &vpx_sad8x16_mmi), + SadMxNParam(8, 8, &vpx_sad8x8_mmi), + SadMxNParam(8, 4, &vpx_sad8x4_mmi), + SadMxNParam(4, 8, &vpx_sad4x8_mmi), + SadMxNParam(4, 4, &vpx_sad4x4_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); + +const SadMxNAvgParam avg_mmi_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); + +const SadMxNx4Param x4d_mmi_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/set_roi.cc b/libs/libvpx/test/set_roi.cc index 38711a806d..f639547523 100644 --- a/libs/libvpx/test/set_roi.cc +++ b/libs/libvpx/test/set_roi.cc @@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { if (deltas_valid != roi_retval) break; } - // Test that we report and error if cyclic refresh is enabled. - cpi.cyclic_refresh_mode_enabled = 1; - roi_retval = - vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, - delta_q, delta_lf, threshold); - EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error"; - cpi.cyclic_refresh_mode_enabled = 0; - // Test invalid number of rows or colums. roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, diff --git a/libs/libvpx/test/stress.sh b/libs/libvpx/test/stress.sh new file mode 100755 index 0000000000..a899c800ca --- /dev/null +++ b/libs/libvpx/test/stress.sh @@ -0,0 +1,169 @@ +#!/bin/sh +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. 
An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file performs a stress test. It runs (STRESS_ONEPASS_MAX_JOBS, +## default=5) one, (STRESS_TWOPASS_MAX_JOBS, default=5) two pass & +## (STRESS_RT_MAX_JOBS, default=5) encodes and (STRESS__DECODE_MAX_JOBS, +## default=30) decodes in parallel. + +. $(dirname $0)/tools_common.sh + +YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv" +VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm" +VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm" +DATA_URL="http://downloads.webmproject.org/test_data/libvpx/" +SHA1_FILE="$(dirname $0)/test-data.sha1" + +# Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is +# cribbed from libs.mk. +[ -x "$(which sha1sum)" ] && sha1sum=sha1sum +[ -x "$(which shasum)" ] && sha1sum=shasum +[ -x "$(which sha1)" ] && sha1sum=sha1 + +# Download a file from the url and check its sha1sum. +download_and_check_file() { + # Get the file from the file path. + local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}" + + # Download the file using curl. Trap to insure non partial file. + (trap "rm -f $1" INT TERM \ + && eval "curl --retry 1 -L -o $1 ${DATA_URL}${root} ${devnull}") + + # Check the sha1 sum of the file. + if [ -n "${sha1sum}" ]; then + set -e + grep ${root} ${SHA1_FILE} \ + | (cd ${LIBVPX_TEST_DATA_PATH}; ${sha1sum} -c); + fi +} + +# Environment check: Make sure input is available. +stress_verify_environment() { + if [ ! -e "${SHA1_FILE}" ] ; then + echo "Missing ${SHA1_FILE}" + return 1 + fi + for file in "${YUV}" "${VP8}" "${VP9}"; do + if [ ! -e "${file}" ] ; then + download_and_check_file "${file}" + fi + done + if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then + elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(vpx_tool_path vpxenc)" ]; then + elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi + if [ -z "$(vpx_tool_path vpxdec)" ]; then + elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi +} + +# This function runs tests on libvpx that run multiple encodes and decodes +# in parallel in hopes of catching synchronization and/or threading issues. +stress() { + local readonly decoder="$(vpx_tool_path vpxdec)" + local readonly encoder="$(vpx_tool_path vpxenc)" + local readonly codec="$1" + local readonly webm="$2" + local readonly decode_count="$3" + local readonly threads="$4" + local readonly enc_args="$5" + local pids="" + local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5} + local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5} + local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5} + + # Enable job control, so we can run multiple processes. + set -m + + # Start $onepass_max_jobs encode jobs in parallel. + for i in $(seq ${onepass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=1" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.1pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + + # Start $twopass_max_jobs encode jobs in parallel. 
+ for i in $(seq ${twopass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=2" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.2pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + + # Start $rt_max_jobs rt encode jobs in parallel. + for i in $(seq ${rt_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal " \ + "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \ + "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \ + "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \ + "--overshoot-pct=15 --buf-sz=1000 --buf-initial-sz=500" \ + "--buf-optimal-sz=600 --max-intra-rate=900 --resize-allowed=0" \ + "--drop-frame=0 --passes=1 --rt --noise-sensitivity=4" \ + "-o ${VPX_TEST_OUTPUT_DIR}/${i}.rt.webm" ${devnull} & + pids="${pids} $!" + done + + # Start $decode_count decode jobs in parallel. + for i in $(seq "${decode_count}"); do + eval "${decoder}" "-t ${threads}" "${webm}" "--noblit" ${devnull} & + pids="${pids} $!" + done + + # Wait for all parallel jobs to finish. + fail=0 + for job in "${pids}"; do + wait $job || fail=$(($fail + 1)) + done + return $fail +} + +vp8_stress_test() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + stress vp8 "${VP8}" "${vp8_max_jobs}" 4 + fi +} + +vp9_stress() { + local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25} + + if [ "$(vp9_decode_available)" = "yes" -a \ + "$(vp9_encode_available)" = "yes" ]; then + stress vp9 "${VP9}" "${vp9_max_jobs}" "$@" + fi +} + +vp9_stress_test() { + for threads in 4 8 100; do + vp9_stress "$threads" "--row-mt=0" + done +} + +vp9_stress_test_row_mt() { + for threads in 4 8 100; do + vp9_stress "$threads" "--row-mt=1" + done +} + +run_tests stress_verify_environment \ + "vp8_stress_test vp9_stress_test vp9_stress_test_row_mt" diff --git a/libs/libvpx/test/sum_squares_test.cc b/libs/libvpx/test/sum_squares_test.cc index 3aa43d2ce0..9c407c649f 100644 --- a/libs/libvpx/test/sum_squares_test.cc +++ b/libs/libvpx/test/sum_squares_test.cc @@ -110,4 +110,10 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2))); #endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple( + &vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); +#endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/svc_test.cc b/libs/libvpx/test/svc_test.cc index 949a820c4f..482d9fffa1 100644 --- a/libs/libvpx/test/svc_test.cc +++ b/libs/libvpx/test/svc_test.cc @@ -438,7 +438,7 @@ TEST_F(SvcTest, SetAutoAltRefOption) { // Test that decoder can handle an SVC frame as the first frame in a sequence. 
 TEST_F(SvcTest, OnePassEncodeOneFrame) {
   codec_enc_.g_pass = VPX_RC_ONE_PASS;
-  vpx_fixed_buf output = { 0 };
+  vpx_fixed_buf output = vpx_fixed_buf();
   Pass2EncodeNFrames(NULL, 1, 2, &output);
   DecodeNFrames(&output, 1);
   FreeBitstreamBuffers(&output, 1);
diff --git a/libs/libvpx/test/temporal_filter_test.cc b/libs/libvpx/test/temporal_filter_test.cc
new file mode 100644
index 0000000000..655a36be9a
--- /dev/null
+++ b/libs/libvpx/test/temporal_filter_test.cc
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
+                                   const uint8_t *b, unsigned int w,
+                                   unsigned int h, int filter_strength,
+                                   int filter_weight, unsigned int *accumulator,
+                                   uint16_t *count);
+
+// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
+// filter based on strength and weight. Store the resulting filter amount in
+// 'count' and apply it to 'b' and store it in 'accumulator'.
+void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
+                      int h, int filter_strength, int filter_weight,
+                      Buffer<unsigned int> *accumulator,
+                      Buffer<uint16_t> *count) {
+  Buffer<int> diff_sq = Buffer<int>(w, h, 0);
+  ASSERT_TRUE(diff_sq.Init());
+  diff_sq.Set(0);
+
+  int rounding = 0;
+  if (filter_strength > 0) {
+    rounding = 1 << (filter_strength - 1);
+  }
+
+  // Calculate all the differences. Avoids re-calculating a bunch of extra
+  // values.
+  for (int height = 0; height < h; ++height) {
+    for (int width = 0; width < w; ++width) {
+      int diff = a.TopLeftPixel()[height * a.stride() + width] -
+                 b.TopLeftPixel()[height * b.stride() + width];
+      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
+    }
+  }
+
+  // For any given point, sum the neighboring values and calculate the
+  // modifier.
+  for (int height = 0; height < h; ++height) {
+    for (int width = 0; width < w; ++width) {
+      // Determine how many values are being summed.
+      int summed_values = 9;
+
+      if (height == 0 || height == (h - 1)) {
+        summed_values -= 3;
+      }
+
+      if (width == 0 || width == (w - 1)) {
+        if (summed_values == 6) {  // corner
+          summed_values -= 2;
+        } else {
+          summed_values -= 3;
+        }
+      }
+
+      // Sum the diff_sq of the surrounding values.
+      int sum = 0;
+      for (int idy = -1; idy <= 1; ++idy) {
+        for (int idx = -1; idx <= 1; ++idx) {
+          const int y = height + idy;
+          const int x = width + idx;
+
+          // If inside the border.
+          if (y >= 0 && y < h && x >= 0 && x < w) {
+            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
+          }
+        }
+      }
+
+      sum *= 3;
+      sum /= summed_values;
+      sum += rounding;
+      sum >>= filter_strength;
+
+      // Clamp the value and invert it.
+      if (sum > 16) sum = 16;
+      sum = 16 - sum;
+
+      sum *= filter_weight;
+
+      count->TopLeftPixel()[height * count->stride() + width] += sum;
+      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
+          sum * b.TopLeftPixel()[height * b.stride() + width];
+    }
+  }
+}
+
+class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  TemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(TemporalFilterTest, SizeCombinations) {
+  // Depending on subsampling this function may be called with values of 8 or 16
+  // for width and height, in any combination.
+  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
+
+  const int filter_weight = 2;
+  const int filter_strength = 6;
+
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      // The difference between the buffers must be small to pass the threshold
+      // to apply the filter.
+      a.Set(&rnd_, 0, 7);
+      b.Set(&rnd_, 0, 7);
+
+      accum_ref.Set(rnd_.Rand8());
+      accum_chk.CopyFrom(accum_ref);
+      count_ref.Set(rnd_.Rand8());
+      count_chk.CopyFrom(count_ref);
+      reference_filter(a, b, width, height, filter_strength, filter_weight,
+                       &accum_ref, &count_ref);
+      ASM_REGISTER_STATE_CHECK(
+          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                       height, filter_strength, filter_weight,
+                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
+      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+      EXPECT_TRUE(count_chk.CheckValues(count_ref));
+      if (HasFailure()) {
+        printf("Width: %d Height: %d\n", width, height);
+        count_chk.PrintDifference(count_ref);
+        accum_chk.PrintDifference(accum_ref);
+        return;
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, CompareReferenceRandom) {
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+      ASSERT_TRUE(a.Init());
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
+        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
+          for (int repeat = 0; repeat < 100; ++repeat) {
+            if (repeat < 50) {
+              a.Set(&rnd_, 0, 7);
+              b.Set(&rnd_, 0, 7);
+            } else {
+              // Check large (but close) values as well.
+              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+                    std::numeric_limits<uint8_t>::max());
+              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+                    std::numeric_limits<uint8_t>::max());
+            }
+
+            accum_ref.Set(rnd_.Rand8());
+            accum_chk.CopyFrom(accum_ref);
+            count_ref.Set(rnd_.Rand8());
+            count_chk.CopyFrom(count_ref);
+            reference_filter(a, b, width, height, filter_strength,
+                             filter_weight, &accum_ref, &count_ref);
+            ASM_REGISTER_STATE_CHECK(filter_func_(
+                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
+                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
+                count_chk.TopLeftPixel()));
+            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+            EXPECT_TRUE(count_chk.CheckValues(count_ref));
+            if (HasFailure()) {
+              printf("Weight: %d Strength: %d\n", filter_weight,
+                     filter_strength);
+              count_chk.PrintDifference(count_ref);
+              accum_chk.PrintDifference(accum_ref);
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
+
+  const int filter_weight = 2;
+  const int filter_strength = 6;
+
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      a.Set(&rnd_, 0, 7);
+      b.Set(&rnd_, 0, 7);
+
+      accum_chk.Set(0);
+      count_chk.Set(0);
+
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 10000; ++i) {
+        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                     height, filter_strength, filter_weight,
+                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("Temporal filter %dx%d time: %5d us\n", width, height,
+             elapsed_time);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
+                        ::testing::Values(&vp9_temporal_filter_apply_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
+                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
+#endif  // HAVE_SSE4_1
+}  // namespace
diff --git a/libs/libvpx/test/test-data.mk b/libs/libvpx/test/test-data.mk
index da2fd77d42..f405e4ef14 100644
--- a/libs/libvpx/test/test-data.mk
+++ b/libs/libvpx/test/test-data.mk
@@ -20,8 +20,10 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv

 # Test vectors
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
@@ -730,6 +732,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
 endif # CONFIG_VP9_HIGHBITDEPTH

 # Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm @@ -764,15 +768,23 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s195 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # Encode / Decode test @@ -807,7 +819,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv @@ -865,3 +876,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libs/libvpx/test/test-data.sha1 b/libs/libvpx/test/test-data.sha1 index 7dd4fcf156..99b4e1e465 100644 --- a/libs/libvpx/test/test-data.sha1 +++ b/libs/libvpx/test/test-data.sha1 @@ -6,6 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm @@ -14,6 +16,7 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm 4935c62becc68c13642a03db1e6d3e2331c1c612 *invalid-vp90-03-v3.webm.res d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res +9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m 0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m @@ -838,3 +841,16 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_ 1e75aad3433c5c21c194a7b53fc393970f0a8d7f *invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf.res 235182f9a1c5c8841552510dd4288487447bfc40 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf 787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf +1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res +70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res +7602e00378161ca36ae93cc6ee12dd30b5ba1e1d *vp90-2-22-svc_1280x720_3.ivf +02e53e3eefbf25ec0929047fe50876acdeb040bd *vp90-2-22-svc_1280x720_3.ivf.md5 +6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm +e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res +fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf +fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm +e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libs/libvpx/test/test.mk b/libs/libvpx/test/test.mk index 60218a780d..a3716be60c 100644 --- a/libs/libvpx/test/test.mk +++ b/libs/libvpx/test/test.mk @@ -1,4 +1,5 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += 
md5_helper.h @@ -35,9 +36,9 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc @@ -46,6 +47,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.cc @@ -121,6 +123,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += predict_test.cc LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.h ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc @@ -148,14 +151,20 @@ LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc @@ -166,7 +175,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) -LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc +LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc diff --git a/libs/libvpx/test/test_intra_pred_speed.cc b/libs/libvpx/test/test_intra_pred_speed.cc index a896307534..1cdeda410a 100644 --- a/libs/libvpx/test/test_intra_pred_speed.cc +++ b/libs/libvpx/test/test_intra_pred_speed.cc @@ -29,6 +29,8 @@ namespace { typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +const int kBPS = 32; +const int kTotalPixels = 32 * kBPS; const int kNumVp9IntraPredFuncs = 13; const char 
*kVp9IntraPredNames[kNumVp9IntraPredFuncs] = { "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED", @@ -36,107 +38,121 @@ const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = { "D207_PRED", "D63_PRED", "TM_PRED" }; +template +struct IntraPredTestMem { + void Init(int block_size, int bd) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel *const above = above_mem + 16; + const int mask = (1 << bd) - 1; + for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask; + for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask; + for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask; + + // some code assumes the top row has been extended: + // d45/d63 C-code, for instance, but not the assembly. + // TODO(jzern): this style of extension isn't strictly necessary. + ASSERT_LE(block_size, kBPS); + for (int i = block_size; i < 2 * kBPS; ++i) { + above[i] = above[block_size - 1]; + } + } + + DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]); + DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]); + DECLARE_ALIGNED(16, Pixel, left[kBPS]); + DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]); +}; + +typedef IntraPredTestMem Vp9IntraPredTestMem; + +void CheckMd5Signature(const char name[], const char *const signatures[], + const void *data, size_t data_size, int elapsed_time, + int idx) { + libvpx_test::MD5 md5; + md5.Add(reinterpret_cast(data), data_size); + printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, kVp9IntraPredNames[idx], + elapsed_time, md5.Get()); + EXPECT_STREQ(signatures[idx], md5.Get()); +} + void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs, - const char *const pred_func_names[], int num_funcs, - const char *const signatures[], int block_size, - int num_pixels_per_test) { - libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); - const int kBPS = 32; - const int kTotalPixels = 32 * kBPS; - DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]); - DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]); - DECLARE_ALIGNED(16, uint8_t, left[kBPS]); - DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]); - uint8_t *const above = above_mem + 16; - for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8(); - for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8(); - for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8(); - const int kNumTests = static_cast(2.e10 / num_pixels_per_test); + const char *const signatures[], int block_size) { + const int kNumTests = static_cast( + 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs)); + Vp9IntraPredTestMem intra_pred_test_mem; + const uint8_t *const above = intra_pred_test_mem.above_mem + 16; - // some code assumes the top row has been extended: - // d45/d63 C-code, for instance, but not the assembly. - // TODO(jzern): this style of extension isn't strictly necessary. 
- ASSERT_LE(block_size, kBPS); - memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size); + intra_pred_test_mem.Init(block_size, 8); - for (int k = 0; k < num_funcs; ++k) { + for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { if (pred_funcs[k] == NULL) continue; - memcpy(src, ref_src, sizeof(src)); + memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, + sizeof(intra_pred_test_mem.src)); vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { - pred_funcs[k](src, kBPS, above, left); + pred_funcs[k](intra_pred_test_mem.src, kBPS, above, + intra_pred_test_mem.left); } libvpx_test::ClearSystemState(); vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer) / 1000); - libvpx_test::MD5 md5; - md5.Add(src, sizeof(src)); - printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, pred_func_names[k], - elapsed_time, md5.Get()); - EXPECT_STREQ(signatures[k], md5.Get()); + CheckMd5Signature(name, signatures, intra_pred_test_mem.src, + sizeof(intra_pred_test_mem.src), elapsed_time, k); } } void TestIntraPred4(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "4334156168b34ab599d9b5b30f522fe9", "bc4649d5ba47c7ff178d92e475960fb0", - "8d316e5933326dcac24e1064794b5d12", "a27270fed024eafd762c95de85f4da51", - "c33dff000d4256c2b8f3bf9e9bab14d2", "44d8cddc2ad8f79b8ed3306051722b4f", - "eb54839b2bad6699d8946f01ec041cd0", "ecb0d56ae5f677ea45127ce9d5c058e4", - "0b7936841f6813da818275944895b574", "9117972ef64f91a58ff73e1731c81db2", - "c56d5e8c729e46825f46dd5d3b5d508a", "c0889e2039bcf7bcb5d2f33cdca69adc", - "309a618577b27c648f9c5ee45252bc8f", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "e7ed7353c3383fff942e500e9bfe82fe", "2a4a26fcc6ce005eadc08354d196c8a9", + "269d92eff86f315d9c38fe7640d85b15", "ae2960eea9f71ee3dabe08b282ec1773", + "6c1abcc44e90148998b51acd11144e9c", "f7bb3186e1ef8a2b326037ff898cad8e", + "364c1f3fb2f445f935aec2a70a67eaa4", "141624072a4a56773f68fadbdd07c4a7", + "7be49b08687a5f24df3a2c612fca3876", "459bb5d9fd5b238348179c9a22108cd6", + "73edb8831bf1bdfce21ae8eaa43b1234", "2e2457f2009c701a355a8b25eb74fcda", + "52ae4e8bdbe41494c1f43051d4dd7f0b" }; - TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs); + TestIntraPred("Intra4", pred_funcs, kSignatures, 4); } void TestIntraPred8(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "7694ddeeefed887faf9d339d18850928", "7d726b1213591b99f736be6dec65065b", - "19c5711281357a485591aaf9c96c0a67", "ba6b66877a089e71cd938e3b8c40caac", - "802440c93317e0f8ba93fab02ef74265", "9e09a47a15deb0b9d8372824f9805080", - "b7c2d8c662268c0c427da412d7b0311d", "78339c1c60bb1d67d248ab8c4da08b7f", - "5c97d70f7d47de1882a6cd86c165c8a9", "8182bf60688b42205acd95e59e967157", - "08323400005a297f16d7e57e7fe1eaac", "95f7bfc262329a5849eda66d8f7c68ce", - "815b75c8e0d91cc1ae766dc5d3e445a3", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "d8bbae5d6547cfc17e4f5f44c8730e88", "373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", "f81dd986eb2b50f750d3a7da716b7e27", + "d500f2c8fc78f46a4c74e4dcf51f14fb", "0e3523f9cab2142dd37fd07ec0760bce", + "79ac4efe907f0a0f1885d43066cfedee", "19ecf2432ac305057de3b6578474eec6", + 
"4f985b61acc6dd5d2d2585fa89ea2e2d", "f1bb25a9060dd262f405f15a38f5f674", + "209ea00801584829e9a0f7be7d4a74ba" }; - TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs); + TestIntraPred("Intra8", pred_funcs, kSignatures, 8); } void TestIntraPred16(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "b40dbb555d5d16a043dc361e6694fe53", "fb08118cee3b6405d64c1fd68be878c6", - "6c190f341475c837cc38c2e566b64875", "db5c34ccbe2c7f595d9b08b0dc2c698c", - "a62cbfd153a1f0b9fed13e62b8408a7a", "143df5b4c89335e281103f610f5052e4", - "d87feb124107cdf2cfb147655aa0bb3c", "7841fae7d4d47b519322e6a03eeed9dc", - "f6ebed3f71cbcf8d6d0516ce87e11093", "3cc480297dbfeed01a1c2d78dd03d0c5", - "b9f69fa6532b372c545397dcb78ef311", "a8fe1c70432f09d0c20c67bdb6432c4d", - "b8a41aa968ec108af447af4217cba91b", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "50971c07ce26977d30298538fffec619", "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", "b0acf2872ee411d7530af6d2625a7084", + "f32aafed4d8d3776ed58bcb6188756d5", "dae208f3dca583529cff49b73f7c4183", + "7af66a2f4c8e0b4908e40f047e60c47c", "125e3ab6ab9bc961f183ec366a7afa88", + "6b90f25b23983c35386b9fd704427622", "f8d6b11d710edc136a7c62c917435f93", + "ed308f18614a362917f411c218aee532" }; - TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs); + TestIntraPred("Intra16", pred_funcs, kSignatures, 16); } void TestIntraPred32(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "558541656d84f9ae7896db655826febe", "b3587a1f9a01495fa38c8cd3c8e2a1bf", - "4c6501e64f25aacc55a2a16c7e8f0255", "b3b01379ba08916ef6b1b35f7d9ad51c", - "0f1eb38b6cbddb3d496199ef9f329071", "911c06efb9ed1c3b4c104b232b55812f", - "9225beb0ddfa7a1d24eaa1be430a6654", "0a6d584a44f8db9aa7ade2e2fdb9fc9e", - "b01c9076525216925f3456f034fb6eee", "d267e20ad9e5cd2915d1a47254d3d149", - "ed012a4a5da71f36c2393023184a0e59", "f162b51ed618d28b936974cff4391da5", - "9e1370c6d42e08d357d9612c93a71cfc", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a0a618c900e65ae521ccc8af789729f2", "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", "bc2c9da91ad97ef0d1610fb0a9041657", + "75c79b1362ad18abfcdb1aa0aacfc21d", "4039bb7da0f6860090d3c57b5c85468f", + "b29fff7b61804e68383e3a609b33da58", "e1aa5e49067fd8dba66c2eb8d07b7a89", + "4e042822909c1c06d3b10a88281df1eb", "72eb9d9e0e67c93f4c66b70348e9fef7", + "a22d102bcb51ca798aac12ca4ae8f2e8" }; - TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs); + TestIntraPred("Intra32", pred_funcs, kSignatures, 32); } } // namespace @@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) { } // ----------------------------------------------------------------------------- -// 4x4 INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c, vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c, @@ -163,47 +178,6 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c, vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c, vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c) -#if HAVE_SSE2 
-INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, - vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, - vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, - vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, - NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, - vpx_tm_predictor_4x4_sse2) -#endif // HAVE_SSE2 - -#if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL, - vpx_d63_predictor_4x4_ssse3, NULL) -#endif // HAVE_SSSE3 - -#if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_4x4_dspr2) -#endif // HAVE_DSPR2 - -#if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, - vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, - vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, - vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_neon) -#endif // HAVE_NEON - -#if HAVE_MSA -INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, - vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, - vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, - vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_msa) -#endif // HAVE_MSA - -// ----------------------------------------------------------------------------- -// 8x8 - INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c, vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c, vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c, @@ -212,46 +186,6 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c, vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c, vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c) -#if HAVE_SSE2 -INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, - vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, - vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, - vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) -#endif // HAVE_SSE2 - -#if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_8x8_ssse3, - vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) -#endif // HAVE_SSSE3 - -#if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_8x8_c) -#endif // HAVE_DSPR2 - -#if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, - vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, - vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, - vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon) - -#endif // HAVE_NEON - -#if HAVE_MSA -INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, - vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, - vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, - vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_8x8_msa) -#endif // HAVE_MSA - -// ----------------------------------------------------------------------------- -// 16x16 - INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c, vpx_dc_left_predictor_16x16_c, 
vpx_dc_top_predictor_16x16_c, vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c, @@ -260,48 +194,6 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c, vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c, vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c) -#if HAVE_SSE2 -INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, - vpx_dc_left_predictor_16x16_sse2, - vpx_dc_top_predictor_16x16_sse2, - vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, - vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_sse2) -#endif // HAVE_SSE2 - -#if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_d45_predictor_16x16_ssse3, NULL, NULL, - vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3, - vpx_d63_predictor_16x16_ssse3, NULL) -#endif // HAVE_SSSE3 - -#if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL, - NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL) -#endif // HAVE_DSPR2 - -#if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, - vpx_dc_left_predictor_16x16_neon, - vpx_dc_top_predictor_16x16_neon, - vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, - vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon) -#endif // HAVE_NEON - -#if HAVE_MSA -INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, - vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, - vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, - vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_msa) -#endif // HAVE_MSA - -// ----------------------------------------------------------------------------- -// 32x32 - INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c, vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c, @@ -311,6 +203,26 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c) #if HAVE_SSE2 +INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, + vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, + vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, + vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, + NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, + vpx_tm_predictor_4x4_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, + vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, + vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, + vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, + NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, + vpx_dc_left_predictor_16x16_sse2, + vpx_dc_top_predictor_16x16_sse2, + vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, + vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_16x16_sse2) + INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, vpx_dc_left_predictor_32x32_sse2, vpx_dc_top_predictor_32x32_sse2, @@ -320,22 +232,79 @@ INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, #endif // HAVE_SSE2 #if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL, + 
vpx_d63_predictor_4x4_ssse3, NULL) +INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_d153_predictor_8x8_ssse3, + vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) +INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_d45_predictor_16x16_ssse3, NULL, NULL, + vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3, + vpx_d63_predictor_16x16_ssse3, NULL) INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL, vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3, NULL) #endif // HAVE_SSSE3 +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL, + NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_tm_predictor_4x4_dspr2) +INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL, + NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_tm_predictor_8x8_c) +INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL, + NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + #if HAVE_NEON +INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, + vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, + vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, + vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, + vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_neon) +INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, + vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, + vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, + vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, + vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_tm_predictor_8x8_neon) +INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, + vpx_dc_left_predictor_16x16_neon, + vpx_dc_top_predictor_16x16_neon, + vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, + vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, + vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, + vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, - vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, + vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, + vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, + vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, + vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_msa) +INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, + vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, + vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, + vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_8x8_msa) +INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, + vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, + vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, + vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL, + 
vpx_tm_predictor_16x16_msa) INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa, vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa, @@ -343,4 +312,275 @@ INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, vpx_tm_predictor_32x32_msa) #endif // HAVE_MSA +#if HAVE_VSX +INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, + vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL, NULL, + NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx, NULL, + NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx, + vpx_tm_predictor_8x8_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx, + vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx, + vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx, + vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, NULL, + NULL, NULL, NULL, vpx_d63_predictor_16x16_vsx, + vpx_tm_predictor_16x16_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, + vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx, + vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx, + vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, NULL, + NULL, NULL, NULL, vpx_d63_predictor_32x32_vsx, + vpx_tm_predictor_32x32_vsx) +#endif // HAVE_VSX + +// ----------------------------------------------------------------------------- + +#if CONFIG_VP9_HIGHBITDEPTH +namespace { + +typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride, + const uint16_t *above, const uint16_t *left, + int bd); + +typedef IntraPredTestMem Vp9HighbdIntraPredTestMem; + +void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs, + const char *const signatures[], int block_size) { + const int kNumTests = static_cast( + 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs)); + Vp9HighbdIntraPredTestMem intra_pred_test_mem; + const uint16_t *const above = intra_pred_test_mem.above_mem + 16; + + intra_pred_test_mem.Init(block_size, 12); + + for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { + if (pred_funcs[k] == NULL) continue; + memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, + sizeof(intra_pred_test_mem.src)); + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { + pred_funcs[k](intra_pred_test_mem.src, kBPS, above, + intra_pred_test_mem.left, 12); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CheckMd5Signature(name, signatures, intra_pred_test_mem.src, + sizeof(intra_pred_test_mem.src), elapsed_time, k); + } +} + +void TestHighbdIntraPred4(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "11f74af6c5737df472f3275cbde062fa", "51bea056b6447c93f6eb8f6b7e8f6f71", + "27e97f946766331795886f4de04c5594", "53ab15974b049111fb596c5168ec7e3f", + "f0b640bb176fbe4584cf3d32a9b0320a", "729783ca909e03afd4b47111c80d967b", + "fbf1c30793d9f32812e4d9f905d53530", "293fc903254a33754133314c6cdba81f", + "f8074d704233e73dfd35b458c6092374", "aa6363d08544a1ec4da33d7a0be5640d", + "462abcfdfa3d087bb33c9a88f2aec491", "863eab65d22550dd44a2397277c1ec71", + "23d61df1574d0fa308f9731811047c4b" + }; + TestHighbdIntraPred("Intra4", pred_funcs, kSignatures, 4); +} + +void 
TestHighbdIntraPred8(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "03da8829fe94663047fd108c5fcaa71d", "ecdb37b8120a2d3a4c706b016bd1bfd7", + "1d4543ed8d2b9368cb96898095fe8a75", "f791c9a67b913cbd82d9da8ecede30e2", + "065c70646f4dbaff913282f55a45a441", "51f87123616662ef7c35691497dfd0ba", + "2a5b0131ef4716f098ee65e6df01e3dd", "9ffe186a6bc7db95275f1bbddd6f7aba", + "a3258a2eae2e2bd55cb8f71351b22998", "8d909f0a2066e39b3216092c6289ece4", + "d183abb30b9f24c886a0517e991b22c7", "702a42fe4c7d665dc561b2aeeb60f311", + "7b5dbbbe7ae3a4ac2948731600bde5d6" + }; + TestHighbdIntraPred("Intra8", pred_funcs, kSignatures, 8); +} + +void TestHighbdIntraPred16(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "e33cb3f56a878e2fddb1b2fc51cdd275", "c7bff6f04b6052c8ab335d726dbbd52d", + "d0b0b47b654a9bcc5c6008110a44589b", "78f5da7b10b2b9ab39f114a33b6254e9", + "c78e31d23831abb40d6271a318fdd6f3", "90d1347f4ec9198a0320daecb6ff90b8", + "d2c623746cbb64a0c9e29c10f2c57041", "cf28bd387b81ad3e5f1a1c779a4b70a0", + "24c304330431ddeaf630f6ce94af2eac", "91a329798036bf64e8e00a87b131b8b1", + "d39111f22885307f920796a42084c872", "e2e702f7250ece98dd8f3f2854c31eeb", + "e2fb05b01eb8b88549e85641d8ce5b59" + }; + TestHighbdIntraPred("Intra16", pred_funcs, kSignatures, 16); +} + +void TestHighbdIntraPred32(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a3e8056ba7e36628cce4917cd956fedd", "cc7d3024fe8748b512407edee045377e", + "2aab0a0f330a1d3e19b8ecb8f06387a3", "a547bc3fb7b06910bf3973122a426661", + "26f712514da95042f93d6e8dc8e431dc", "bb08c6e16177081daa3d936538dbc2e3", + "8f031af3e2650e89620d8d2c3a843d8b", "42867c8553285e94ee8e4df7abafbda8", + "6496bdee96100667833f546e1be3d640", "2ebfa25bf981377e682e580208504300", + "3e8ae52fd1f607f348aa4cb436c71ab7", "3d4efe797ca82193613696753ea624c4", + "cb8aab6d372278f3131e8d99efde02d9" + }; + TestHighbdIntraPred("Intra32", pred_funcs, kSignatures, 32); +} + +} // namespace + +// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors +// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4. 
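// The HIGHBD_INTRA_PRED_TEST macro defined just below stamps out one Google
// Test case per (architecture, high-bit-depth test function) pair; passing
// NULL for a slot leaves a hole in the function-pointer table that the shared
// helper skips. As a rough illustration (not a literal preprocessor dump),
// the C/4x4 instantiation that appears further down expands to approximately
// the following:

TEST(C, TestHighbdIntraPred4) {
  static const VpxHighbdPredFunc vpx_intra_pred[] = {
    vpx_highbd_dc_predictor_4x4_c,     vpx_highbd_dc_left_predictor_4x4_c,
    vpx_highbd_dc_top_predictor_4x4_c, vpx_highbd_dc_128_predictor_4x4_c,
    vpx_highbd_v_predictor_4x4_c,      vpx_highbd_h_predictor_4x4_c,
    vpx_highbd_d45_predictor_4x4_c,    vpx_highbd_d135_predictor_4x4_c,
    vpx_highbd_d117_predictor_4x4_c,   vpx_highbd_d153_predictor_4x4_c,
    vpx_highbd_d207_predictor_4x4_c,   vpx_highbd_d63_predictor_4x4_c,
    vpx_highbd_tm_predictor_4x4_c
  };
  TestHighbdIntraPred4(vpx_intra_pred);
}

// Each generated case can then be selected on its own, e.g. with gtest's
// --gtest_filter=C.TestHighbdIntraPred4.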
+#define HIGHBD_INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, \ + v, h, d45, d135, d117, d153, d207, d63, tm) \ + TEST(arch, test_func) { \ + static const VpxHighbdPredFunc vpx_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \ + }; \ + test_func(vpx_intra_pred); \ + } + +// ----------------------------------------------------------------------------- + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_c, + vpx_highbd_dc_left_predictor_4x4_c, vpx_highbd_dc_top_predictor_4x4_c, + vpx_highbd_dc_128_predictor_4x4_c, vpx_highbd_v_predictor_4x4_c, + vpx_highbd_h_predictor_4x4_c, vpx_highbd_d45_predictor_4x4_c, + vpx_highbd_d135_predictor_4x4_c, vpx_highbd_d117_predictor_4x4_c, + vpx_highbd_d153_predictor_4x4_c, vpx_highbd_d207_predictor_4x4_c, + vpx_highbd_d63_predictor_4x4_c, vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_c, + vpx_highbd_dc_left_predictor_8x8_c, vpx_highbd_dc_top_predictor_8x8_c, + vpx_highbd_dc_128_predictor_8x8_c, vpx_highbd_v_predictor_8x8_c, + vpx_highbd_h_predictor_8x8_c, vpx_highbd_d45_predictor_8x8_c, + vpx_highbd_d135_predictor_8x8_c, vpx_highbd_d117_predictor_8x8_c, + vpx_highbd_d153_predictor_8x8_c, vpx_highbd_d207_predictor_8x8_c, + vpx_highbd_d63_predictor_8x8_c, vpx_highbd_tm_predictor_8x8_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_c, + vpx_highbd_dc_left_predictor_16x16_c, vpx_highbd_dc_top_predictor_16x16_c, + vpx_highbd_dc_128_predictor_16x16_c, vpx_highbd_v_predictor_16x16_c, + vpx_highbd_h_predictor_16x16_c, vpx_highbd_d45_predictor_16x16_c, + vpx_highbd_d135_predictor_16x16_c, vpx_highbd_d117_predictor_16x16_c, + vpx_highbd_d153_predictor_16x16_c, vpx_highbd_d207_predictor_16x16_c, + vpx_highbd_d63_predictor_16x16_c, vpx_highbd_tm_predictor_16x16_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_c, + vpx_highbd_dc_left_predictor_32x32_c, vpx_highbd_dc_top_predictor_32x32_c, + vpx_highbd_dc_128_predictor_32x32_c, vpx_highbd_v_predictor_32x32_c, + vpx_highbd_h_predictor_32x32_c, vpx_highbd_d45_predictor_32x32_c, + vpx_highbd_d135_predictor_32x32_c, vpx_highbd_d117_predictor_32x32_c, + vpx_highbd_d153_predictor_32x32_c, vpx_highbd_d207_predictor_32x32_c, + vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2, + vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2, + vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2, + vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, + vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, + vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, + vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, + vpx_highbd_dc_predictor_16x16_sse2, + vpx_highbd_dc_left_predictor_16x16_sse2, + vpx_highbd_dc_top_predictor_16x16_sse2, + vpx_highbd_dc_128_predictor_16x16_sse2, + vpx_highbd_v_predictor_16x16_sse2, + 
vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, + vpx_highbd_dc_predictor_32x32_sse2, + vpx_highbd_dc_left_predictor_32x32_sse2, + vpx_highbd_dc_top_predictor_32x32_sse2, + vpx_highbd_dc_128_predictor_32x32_sse2, + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL, + NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3, + vpx_highbd_d135_predictor_8x8_ssse3, + vpx_highbd_d117_predictor_8x8_ssse3, + vpx_highbd_d153_predictor_8x8_ssse3, + vpx_highbd_d207_predictor_8x8_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d135_predictor_16x16_ssse3, + vpx_highbd_d117_predictor_16x16_ssse3, + vpx_highbd_d153_predictor_16x16_ssse3, + vpx_highbd_d207_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d135_predictor_32x32_ssse3, + vpx_highbd_d117_predictor_32x32_ssse3, + vpx_highbd_d153_predictor_32x32_ssse3, + vpx_highbd_d207_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_32x32_ssse3, NULL) +#endif // HAVE_SSSE3 + +#if HAVE_NEON +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, + vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, + vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, + vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, + vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_highbd_tm_predictor_4x4_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, + vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, + vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, + vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, + vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, + vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, + vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, + vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, + NULL, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, + vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, + vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, + vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, + NULL, vpx_highbd_tm_predictor_32x32_neon) +#endif // HAVE_NEON + +#endif // CONFIG_VP9_HIGHBITDEPTH + #include "test/test_libvpx.cc" diff --git a/libs/libvpx/test/test_libvpx.cc 
b/libs/libvpx/test/test_libvpx.cc index 8a70b4e28c..30641ae8c8 100644 --- a/libs/libvpx/test/test_libvpx.cc +++ b/libs/libvpx/test/test_libvpx.cc @@ -53,6 +53,9 @@ int main(int argc, char **argv) { } if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } #endif // ARCH_X86 || ARCH_X86_64 #if !CONFIG_SHARED diff --git a/libs/libvpx/test/test_vector_test.cc b/libs/libvpx/test/test_vector_test.cc index 6d43157404..1879b3d277 100644 --- a/libs/libvpx/test/test_vector_test.cc +++ b/libs/libvpx/test/test_vector_test.cc @@ -28,13 +28,10 @@ namespace { -enum DecodeMode { kSerialMode, kFrameParallelMode }; +const int kThreads = 0; +const int kFileName = 1; -const int kDecodeMode = 0; -const int kThreads = 1; -const int kFileName = 2; - -typedef std::tr1::tuple DecodeParam; +typedef std::tr1::tuple DecodeParam; class TestVectorTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { @@ -53,8 +50,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -92,29 +89,14 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, TEST_P(TestVectorTest, MD5Match) { const DecodeParam input = GET_PARAM(1); const std::string filename = std::tr1::get(input); - const int threads = std::tr1::get(input); - const int mode = std::tr1::get(input); vpx_codec_flags_t flags = 0; vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); char str[256]; - if (mode == kFrameParallelMode) { - flags |= VPX_CODEC_USE_FRAME_THREADING; -#if CONFIG_VP9_DECODER - // TODO(hkuang): Fix frame parallel decode bug. See issue 1086. - if (resize_clips_.find(filename) != resize_clips_.end()) { - printf("Skipping the test file: %s, due to frame parallel decode bug.\n", - filename.c_str()); - return; - } -#endif - } + cfg.threads = std::tr1::get(input); - cfg.threads = threads; - - snprintf(str, sizeof(str) / sizeof(str[0]) - 1, - "file: %s mode: %s threads: %d", filename.c_str(), - mode == 0 ? "Serial" : "Parallel", threads); + snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d", + filename.c_str(), cfg.threads); SCOPED_TRACE(str); // Open compressed video file. @@ -145,38 +127,44 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } -// Test VP8 decode in serial mode with single thread. -// NOTE: VP8 only support serial mode. #if CONFIG_VP8_DECODER VP8_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + libvpx_test::kNumVP8TestVectors))); + +// Test VP8 decode in with different numbers of threads. +INSTANTIATE_TEST_CASE_P( + VP8MultiThreaded, TestVectorTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP8)), + ::testing::Combine( + ::testing::Range(2, 9), // With 2 ~ 8 threads. 
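// The VP8MultiThreaded / VP9MultiThreaded instantiations in this hunk combine
// ::testing::Range(2, 9) with the test-vector list so that every clip is
// decoded with 2 through 8 threads (gtest's Range excludes its upper bound);
// the separate decode-mode axis (serial vs. frame parallel) is dropped from
// the parameter tuple. A minimal, self-contained sketch of the same
// parameterization pattern follows, with hypothetical fixture and clip names,
// assuming a googletest recent enough to use std::tuple with Combine.

#include <gtest/gtest.h>

#include <string>
#include <tuple>

// Parameter: (thread count, clip name) -- a stand-in for the DecodeParam
// tuple used by the real test, with a stub body instead of a decoder.
typedef std::tuple<int, const char *> StubDecodeParam;

class StubDecodeTest : public ::testing::TestWithParam<StubDecodeParam> {};

TEST_P(StubDecodeTest, ThreadCountInRange) {
  const int threads = std::get<0>(GetParam());
  const std::string clip = std::get<1>(GetParam());
  EXPECT_GE(threads, 2);
  EXPECT_LE(threads, 8);  // Range(2, 9) generates 2..8.
  EXPECT_FALSE(clip.empty());
}

static const char *const kClips[] = { "clip-a.webm", "clip-b.webm" };

// 7 thread counts x 2 clips = 14 generated test instances.
INSTANTIATE_TEST_CASE_P(
    MultiThreaded, StubDecodeTest,
    ::testing::Combine(::testing::Range(2, 9), ::testing::ValuesIn(kClips)));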
+ ::testing::ValuesIn(libvpx_test::kVP8TestVectors, + libvpx_test::kVP8TestVectors + + libvpx_test::kNumVP8TestVectors)))); + #endif // CONFIG_VP8_DECODER -// Test VP9 decode in serial mode with single thread. #if CONFIG_VP9_DECODER VP9_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors))); -// Test VP9 decode in frame parallel mode with different number of threads. INSTANTIATE_TEST_CASE_P( - VP9MultiThreadedFrameParallel, TestVectorTest, + VP9MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( static_cast(&libvpx_test::kVP9)), ::testing::Combine( - ::testing::Values(1), // Frame Parallel mode. ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + diff --git a/libs/libvpx/test/test_vectors.cc b/libs/libvpx/test/test_vectors.cc index 460c1f51b4..3ffc3efc41 100644 --- a/libs/libvpx/test/test_vectors.cc +++ b/libs/libvpx/test/test_vectors.cc @@ -371,9 +371,12 @@ const char *const kVP9TestVectors[] = { #endif // CONFIG_VP9_HIGHBITDEPTH "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm", + "vp90-2-22-svc_1280x720_1.webm", RESIZE_TEST_VECTORS }; +const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" }; const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); +const int kNumVP9TestVectorsSvc = NELEMENTS(kVP9TestVectorsSvc); const char *const kVP9TestVectorsResize[] = { RESIZE_TEST_VECTORS }; const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize); #undef RESIZE_TEST_VECTORS diff --git a/libs/libvpx/test/test_vectors.h b/libs/libvpx/test/test_vectors.h index 2c6918abda..3df3e81133 100644 --- a/libs/libvpx/test/test_vectors.h +++ b/libs/libvpx/test/test_vectors.h @@ -23,6 +23,8 @@ extern const char *const kVP8TestVectors[]; #if CONFIG_VP9_DECODER extern const int kNumVP9TestVectors; extern const char *const kVP9TestVectors[]; +extern const int kNumVP9TestVectorsSvc; +extern const char *const kVP9TestVectorsSvc[]; extern const int kNumVP9TestVectorsResize; extern const char *const kVP9TestVectorsResize[]; #endif // CONFIG_VP9_DECODER diff --git a/libs/libvpx/test/twopass_encoder.sh b/libs/libvpx/test/twopass_encoder.sh index 7a223f2afc..eaeaabdfd0 100755 --- a/libs/libvpx/test/twopass_encoder.sh +++ b/libs/libvpx/test/twopass_encoder.sh @@ -54,7 +54,10 @@ twopass_encoder_vp9() { fi } -twopass_encoder_tests="twopass_encoder_vp8 - twopass_encoder_vp9" -run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + twopass_encoder_tests="twopass_encoder_vp8 + twopass_encoder_vp9" + + run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +fi diff --git a/libs/libvpx/test/variance_test.cc b/libs/libvpx/test/variance_test.cc index 6e31165faf..421024ad88 100644 --- a/libs/libvpx/test/variance_test.cc +++ b/libs/libvpx/test/variance_test.cc @@ -22,6 +22,7 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { @@ -345,6 +346,7 @@ class MainTestClass void RefTest(); void RefStrideTest(); void OneQuarterTest(); + void SpeedTest(); // MSE/SSE tests void RefTestMse(); @@ -363,6 +365,7 @@ class MainTestClass int byte_shift() const { return params_.bit_depth - 8; } int block_size() const { 
return params_.block_size; } int width() const { return params_.width; } + int height() const { return params_.height; } uint32_t mask() const { return params_.mask; } }; @@ -471,6 +474,35 @@ void MainTestClass::OneQuarterTest() { EXPECT_EQ(expected, var); } +template +void MainTestClass::SpeedTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < (1 << 30) / block_size(); ++i) { + const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); + // Ignore return value. + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Variance %dx%d time: %5d ms\n", width(), height(), + elapsed_time / 1000); +} + //////////////////////////////////////////////////////////////////////////////// // Tests related to MSE / SSE. @@ -529,46 +561,26 @@ void MainTestClass::MaxTestSse() { //////////////////////////////////////////////////////////////////////////////// -using ::std::tr1::get; -using ::std::tr1::make_tuple; -using ::std::tr1::tuple; - -template +template class SubpelVarianceTest - : public ::testing::TestWithParam< - tuple > { + : public ::testing::TestWithParam > { public: virtual void SetUp() { - const tuple ¶ms = - this->GetParam(); - log2width_ = get<0>(params); - width_ = 1 << log2width_; - log2height_ = get<1>(params); - height_ = 1 << log2height_; - subpel_variance_ = get<2>(params); - if (get<3>(params)) { - bit_depth_ = (vpx_bit_depth_t)get<3>(params); - use_high_bit_depth_ = true; - } else { - bit_depth_ = VPX_BITS_8; - use_high_bit_depth_ = false; - } - mask_ = (1 << bit_depth_) - 1; + params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); - block_size_ = width_ * height_; - if (!use_high_bit_depth_) { - src_ = reinterpret_cast(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + if (!use_high_bit_depth()) { + src_ = reinterpret_cast(vpx_memalign(16, block_size())); + sec_ = reinterpret_cast(vpx_memalign(16, block_size())); + ref_ = new uint8_t[block_size() + width() + height() + 1]; #if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); - ref_ = - CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR( + new uint16_t[block_size() + width() + height() + 1]); #endif // CONFIG_VP9_HIGHBITDEPTH } ASSERT_TRUE(src_ != NULL); @@ -577,7 +589,7 @@ class SubpelVarianceTest } virtual void TearDown() { - if (!use_high_bit_depth_) { + if (!use_high_bit_depth()) { vpx_free(src_); delete[] ref_; vpx_free(sec_); @@ -599,42 +611,45 @@ class SubpelVarianceTest uint8_t *src_; uint8_t *ref_; uint8_t *sec_; - bool use_high_bit_depth_; - vpx_bit_depth_t bit_depth_; - int width_, 
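// The SpeedTest() added above runs each variance kernel (1 << 30) /
// block_size() times, so the total number of pixels processed is roughly the
// same for every block size, and reports wall-clock time via vpx_usec_timer.
// It is registered further down as DISABLED_Speed, so it only runs when
// explicitly requested (for example with gtest's
// --gtest_also_run_disabled_tests flag). Below is a standalone sketch of the
// same timing pattern; BlockSse is a hypothetical stand-in kernel and
// std::chrono replaces the vpx timer.

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in kernel: sum of squared differences between two 8-bit blocks laid
// out with the given stride.
static uint32_t BlockSse(const uint8_t *src, const uint8_t *ref, int stride,
                         int width, int height) {
  uint32_t sse = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int d = src[r * stride + c] - ref[r * stride + c];
      sse += static_cast<uint32_t>(d * d);
    }
  }
  return sse;
}

int main() {
  const int width = 16, height = 16, stride = 16;
  const int block_size = width * height;
  uint8_t src[16 * 16], ref[16 * 16];
  memset(src, 255, sizeof(src));
  memset(ref, 255, block_size / 2);                  // first half all-max ...
  memset(ref + block_size / 2, 0, block_size / 2);   // ... second half zero

  const int num_runs = (1 << 30) / block_size;  // same scaling as the patch
  volatile uint32_t sink = 0;  // keep the compiler from dropping the loop

  const std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();
  for (int i = 0; i < num_runs; ++i) {
    sink += BlockSse(src, ref, stride, width, height);
  }
  const std::chrono::steady_clock::time_point stop =
      std::chrono::steady_clock::now();

  const long ms = static_cast<long>(
      std::chrono::duration_cast<std::chrono::milliseconds>(stop - start)
          .count());
  printf("SSE %dx%d: %ld ms (last sse %u)\n", width, height, ms,
         static_cast<unsigned>(sink));
  return 0;
}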
log2width_; - int height_, log2height_; - int block_size_, mask_; - SubpelVarianceFunctionType subpel_variance_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } }; template void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -648,28 +663,28 @@ void SubpelVarianceTest::ExtremeRefTest() { // Ref: Set the first half of values to the maximum, the second half to 0. 
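// The half-maximum / half-zero blocks described just above produce the
// largest possible per-pixel differences, so they exercise the widest
// intermediate sums an optimized kernel can see; any narrowing of those sums
// shows up as a mismatch against the C reference. A small arithmetic check of
// the worst-case sum of squared errors, assuming a 64x64 block (the largest
// size instantiated in this file):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t pixels = 64 * 64;
  // 8-bit: the largest possible per-pixel difference is 255.
  const uint64_t max_diff_8 = 255;
  const uint64_t worst_sse_8 = pixels * max_diff_8 * max_diff_8;
  // 12-bit: per-pixel differences can reach 4095.
  const uint64_t max_diff_12 = 4095;
  const uint64_t worst_sse_12 = pixels * max_diff_12 * max_diff_12;

  printf("worst-case SSE, 8-bit 64x64 : %llu (fits in 32 bits: %s)\n",
         (unsigned long long)worst_sse_8,
         worst_sse_8 <= UINT32_MAX ? "yes" : "no");
  printf("worst-case SSE, 12-bit 64x64: %llu (fits in 32 bits: %s)\n",
         (unsigned long long)worst_sse_12,
         worst_sse_12 <= UINT32_MAX ? "yes" : "no");
  return 0;
}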
for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - const int half = block_size_ / 2; - if (!use_high_bit_depth_) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); + memset(ref_ + half, 0, half + width() + height() + 1); #if CONFIG_VP9_HIGHBITDEPTH } else { - vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); - vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, - half + width_ + height_ + 1); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(), + half + width() + height() + 1); #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; } @@ -680,33 +695,32 @@ template <> void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); sec_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; - CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } uint32_t sse1, sse2; uint32_t var1, var2; - ASM_REGISTER_STATE_CHECK(var1 = - subpel_variance_(ref_, width_ + 1, x, y, - src_, width_, &sse1, sec_)); - var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, - x, y, &sse2, use_high_bit_depth_, - static_cast(bit_depth_)); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y, + src_, width(), &sse1, sec_)); + var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width, + params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -727,6 +741,7 @@ TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxVarianceTest, 
DISABLED_Speed) { SpeedTest(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } @@ -765,37 +780,41 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); +typedef TestParams SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); +typedef TestParams SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), + 
SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH typedef MainTestClass VpxHBDMseTest; @@ -809,6 +828,7 @@ TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } @@ -816,18 +836,18 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_CASE_P( C, VpxHBDMseTest, - ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x8_c))); + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), + MseParams(4, 4, &vpx_highbd_12_mse16x8_c), + MseParams(4, 4, &vpx_highbd_12_mse8x16_c), + MseParams(4, 4, &vpx_highbd_12_mse8x8_c), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c), + MseParams(4, 4, &vpx_highbd_10_mse16x8_c), + MseParams(4, 4, &vpx_highbd_10_mse8x16_c), + MseParams(4, 4, &vpx_highbd_10_mse8x8_c), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c), + MseParams(4, 4, &vpx_highbd_8_mse16x8_c), + MseParams(4, 4, &vpx_highbd_8_mse8x16_c), + MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); */ INSTANTIATE_TEST_CASE_P( @@ -875,88 +895,161 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10), - make_tuple(4, 4, 
&vpx_highbd_10_sub_pixel_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12))); + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, + 12), + 
SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, + 12))); INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, + 
&vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, + 8), + SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, + 8), + SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, + 8), + SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, + 8), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_c, + 12))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 @@ -987,36 +1080,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), - make_tuple(5, 5, 
&vpx_sub_pixel_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH /* TODO(debargha): This test does not support the highbd version @@ -1073,112 +1167,219 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, 
&vpx_highbd_12_sub_pixel_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8))); + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, + 10), + SubpelVarianceParams(4, 5, 
&vpx_highbd_10_sub_pixel_variance16x32_sse2, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, + 8))); INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8), 
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), - make_tuple(5, 4, 
&vpx_sub_pixel_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, + 0))); #endif // HAVE_SSSE3 #if HAVE_AVX2 @@ -1195,14 +1396,16 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), - make_tuple(5, 5, 
&vpx_sub_pixel_variance32x32_avx2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, + 0))); #endif // HAVE_AVX2 #if HAVE_NEON @@ -1219,17 +1422,49 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(6, 5, &vpx_variance64x32_neon), VarianceParams(5, 6, &vpx_variance32x64_neon), VarianceParams(5, 5, &vpx_variance32x32_neon), + VarianceParams(5, 4, &vpx_variance32x16_neon), + VarianceParams(4, 5, &vpx_variance16x32_neon), VarianceParams(4, 4, &vpx_variance16x16_neon), VarianceParams(4, 3, &vpx_variance16x8_neon), VarianceParams(3, 4, &vpx_variance8x16_neon), - VarianceParams(3, 3, &vpx_variance8x8_neon))); + VarianceParams(3, 3, &vpx_variance8x8_neon), + VarianceParams(3, 2, &vpx_variance8x4_neon), + VarianceParams(2, 3, &vpx_variance4x8_neon), + VarianceParams(2, 2, &vpx_variance4x4_neon))); INSTANTIATE_TEST_CASE_P( NEON, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); + +INSTANTIATE_TEST_CASE_P( + NEON, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); #endif // HAVE_NEON #if HAVE_MSA @@ -1264,34 
+1499,103 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), - make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); + ::testing::Values( + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 
0))); #endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_vsx))); +#endif // HAVE_VSX + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), + VarianceParams(6, 5, &vpx_variance64x32_mmi), + VarianceParams(5, 6, &vpx_variance32x64_mmi), + VarianceParams(5, 5, &vpx_variance32x32_mmi), + VarianceParams(5, 4, &vpx_variance32x16_mmi), + VarianceParams(4, 5, &vpx_variance16x32_mmi), + VarianceParams(4, 4, &vpx_variance16x16_mmi), + VarianceParams(4, 3, &vpx_variance16x8_mmi), + VarianceParams(3, 4, &vpx_variance8x16_mmi), + VarianceParams(3, 3, &vpx_variance8x8_mmi), + VarianceParams(3, 2, &vpx_variance8x4_mmi), + VarianceParams(2, 3, &vpx_variance4x8_mmi), + VarianceParams(2, 2, &vpx_variance4x4_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/video_source.h b/libs/libvpx/test/video_source.h index 424bb2cfb4..54f692865b 100644 --- a/libs/libvpx/test/video_source.h +++ b/libs/libvpx/test/video_source.h @@ -13,7 +13,9 @@ #if defined(_WIN32) #undef NOMINMAX #define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #endif #include diff --git 
a/libs/libvpx/test/vp8_fdct4x4_test.cc b/libs/libvpx/test/vp8_fdct4x4_test.cc index da4f0caa1e..b7697d8596 100644 --- a/libs/libvpx/test/vp8_fdct4x4_test.cc +++ b/libs/libvpx/test/vp8_fdct4x4_test.cc @@ -17,12 +17,16 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" #include "./vp8_rtcd.h" #include "test/acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" namespace { +typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride); + const int cospi8sqrt2minus1 = 20091; const int sinpi8sqrt2 = 35468; @@ -68,10 +72,21 @@ void reference_idct4x4(const int16_t *input, int16_t *output) { using libvpx_test::ACMRandom; -TEST(VP8FdctTest, SignBiasCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); +class FdctTest : public ::testing::TestWithParam { + public: + virtual void SetUp() { + fdct_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + FdctFunc fdct_func_; + ACMRandom rnd_; +}; + +TEST_P(FdctTest, SignBiasCheck) { int16_t test_input_block[16]; - int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_output_block[16]); const int pitch = 8; int count_sign_block[16][2]; const int count_test_block = 1000000; @@ -81,10 +96,10 @@ TEST(VP8FdctTest, SignBiasCheck) { for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); } - vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + fdct_func_(test_input_block, test_output_block, pitch); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) { @@ -110,10 +125,10 @@ TEST(VP8FdctTest, SignBiasCheck) { for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-15, 15]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); + test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4); } - vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + fdct_func_(test_input_block, test_output_block, pitch); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) { @@ -135,23 +150,22 @@ TEST(VP8FdctTest, SignBiasCheck) { << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; }; -TEST(VP8FdctTest, RoundTripErrorCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); +TEST_P(FdctTest, RoundTripErrorCheck) { int max_error = 0; double total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[16]; - int16_t test_temp_block[16]; int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_temp_block[16]); // Initialize a test block with input range [-255, 255]. 
for (int j = 0; j < 16; ++j) { - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); } const int pitch = 8; - vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch); + fdct_func_(test_input_block, test_temp_block, pitch); reference_idct4x4(test_temp_block, test_output_block); for (int j = 0; j < 16; ++j) { @@ -169,4 +183,24 @@ TEST(VP8FdctTest, RoundTripErrorCheck) { << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; }; +INSTANTIATE_TEST_CASE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, FdctTest, + ::testing::Values(vp8_short_fdct4x4_neon)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, FdctTest, + ::testing::Values(vp8_short_fdct4x4_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, FdctTest, + ::testing::Values(vp8_short_fdct4x4_msa)); +#endif // HAVE_MSA +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, FdctTest, + ::testing::Values(vp8_short_fdct4x4_mmi)); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/vp9_error_block_test.cc b/libs/libvpx/test/vp9_block_error_test.cc similarity index 72% rename from libs/libvpx/test/vp9_error_block_test.cc rename to libs/libvpx/test/vp9_block_error_test.cc index 74436c09e7..0b4d1df992 100644 --- a/libs/libvpx/test/vp9_error_block_test.cc +++ b/libs/libvpx/test/vp9_block_error_test.cc @@ -23,36 +23,36 @@ #include "vp9/common/vp9_entropy.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" using libvpx_test::ACMRandom; namespace { -#if CONFIG_VP9_HIGHBITDEPTH const int kNumIterations = 1000; -typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff, +typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps); + +typedef std::tr1::tuple + BlockErrorParam; + +typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, int bps); + intptr_t block_size, int64_t *ssz); -typedef std::tr1::tuple - ErrorBlockParam; - -// wrapper for 8-bit block error functions without a 'bps' param. 
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz); -template -int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { - EXPECT_EQ(8, bps); +template +int64_t BlockError8BitWrapper(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bps) { + EXPECT_EQ(bps, 8); return fn(coeff, dqcoeff, block_size, ssz); } -class ErrorBlockTest : public ::testing::TestWithParam { +class BlockErrorTest : public ::testing::TestWithParam { public: - virtual ~ErrorBlockTest() {} + virtual ~BlockErrorTest() {} virtual void SetUp() { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); @@ -63,11 +63,11 @@ class ErrorBlockTest : public ::testing::TestWithParam { protected: vpx_bit_depth_t bit_depth_; - ErrorBlockFunc error_block_op_; - ErrorBlockFunc ref_error_block_op_; + HBDBlockErrorFunc error_block_op_; + HBDBlockErrorFunc ref_error_block_op_; }; -TEST_P(ErrorBlockTest, OperationCheck) { +TEST_P(BlockErrorTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); @@ -110,7 +110,7 @@ TEST_P(ErrorBlockTest, OperationCheck) { << "First failed at test case " << first_failure; } -TEST_P(ErrorBlockTest, ExtremeValues) { +TEST_P(BlockErrorTest, ExtremeValues) { ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); @@ -171,29 +171,28 @@ TEST_P(ErrorBlockTest, ExtremeValues) { using std::tr1::make_tuple; #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, ErrorBlockTest, - ::testing::Values( - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_10), - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_12), - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_8), - make_tuple( - &HighBdBlockError8bitWrapper, - &HighBdBlockError8bitWrapper, - VPX_BITS_8))); +const BlockErrorParam sse2_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest, + ::testing::ValuesIn(sse2_block_error_tests)); #endif // HAVE_SSE2 -#if HAVE_AVX +#if HAVE_AVX2 INSTANTIATE_TEST_CASE_P( - AVX, ErrorBlockTest, - ::testing::Values(make_tuple( - &HighBdBlockError8bitWrapper, - &HighBdBlockError8bitWrapper, - VPX_BITS_8))); -#endif // HAVE_AVX - -#endif // CONFIG_VP9_HIGHBITDEPTH + AVX2, BlockErrorTest, + ::testing::Values(make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, + VPX_BITS_8))); +#endif // HAVE_AVX2 } // namespace diff --git a/libs/libvpx/test/vp9_denoiser_sse2_test.cc b/libs/libvpx/test/vp9_denoiser_test.cc similarity index 54% rename from libs/libvpx/test/vp9_denoiser_sse2_test.cc rename to libs/libvpx/test/vp9_denoiser_test.cc index 2a50b77355..56ca257c59 100644 --- a/libs/libvpx/test/vp9_denoiser_sse2_test.cc +++ b/libs/libvpx/test/vp9_denoiser_test.cc @@ -29,11 +29,21 @@ using libvpx_test::ACMRandom; namespace { const int kNumPixels 
= 64 * 64; -class VP9DenoiserTest : public ::testing::TestWithParam { + +typedef int (*Vp9DenoiserFilterFunc)(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude); +typedef std::tr1::tuple VP9DenoiserTestParam; + +class VP9DenoiserTest + : public ::testing::Test, + public ::testing::WithParamInterface { public: virtual ~VP9DenoiserTest() {} - virtual void SetUp() { bs_ = GetParam(); } + virtual void SetUp() { bs_ = GET_PARAM(1); } virtual void TearDown() { libvpx_test::ClearSystemState(); } @@ -76,9 +86,9 @@ TEST_P(VP9DenoiserTest, BitexactCheck) { 64, avg_block_c, 64, 0, bs_, motion_magnitude_random)); - ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2( - sig_block, 64, mc_avg_block, 64, avg_block_sse2, 64, 0, bs_, - motion_magnitude_random)); + ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 64, mc_avg_block, 64, + avg_block_sse2, 64, 0, bs_, + motion_magnitude_random)); // Test bitexactness. for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) { @@ -89,10 +99,36 @@ TEST_P(VP9DenoiserTest, BitexactCheck) { } } +using std::tr1::make_tuple; + // Test for all block size. -INSTANTIATE_TEST_CASE_P(SSE2, VP9DenoiserTest, - ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, - BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, - BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, - BLOCK_64X64)); +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64))); +#endif } // namespace diff --git a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc index 53dc8c9fe4..62e8dcb9b5 100644 --- a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc +++ b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc @@ -99,9 +99,7 @@ class VpxEncoderParmsGetToDecoder vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); vpx_codec_alg_priv_t *const priv = reinterpret_cast(vp9_decoder->priv); - FrameWorkerData *const worker_data = - reinterpret_cast(priv->frame_workers[0].data1); - VP9_COMMON *const common = &worker_data->pbi->common; + VP9_COMMON *const common = &priv->pbi->common; if (encode_parms.lossless) { EXPECT_EQ(0, common->base_qindex); diff --git a/libs/libvpx/test/vp9_ethread_test.cc b/libs/libvpx/test/vp9_ethread_test.cc index 0590487fc0..6b7e512116 100644 
--- a/libs/libvpx/test/vp9_ethread_test.cc +++ b/libs/libvpx/test/vp9_ethread_test.cc @@ -16,17 +16,221 @@ #include "test/md5_helper.h" #include "test/util.h" #include "test/y4m_video_source.h" +#include "vp9/encoder/vp9_firstpass.h" namespace { -class VPxEncoderThreadTest +// FIRSTPASS_STATS struct: +// { +// 25 double members; +// 1 int64_t member; +// } +// Whenever FIRSTPASS_STATS struct is modified, the following constants need to +// be revisited. +const int kDbl = 25; +const int kInt = 1; +const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); + +class VPxFirstPassEncoderThreadTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWith2Params { + protected: + VPxFirstPassEncoderThreadTest() + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(0), + encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { + init_flags_ = VPX_CODEC_USE_PSNR; + + row_mt_mode_ = 1; + first_pass_only_ = true; + firstpass_stats_.buf = NULL; + firstpass_stats_.sz = 0; + } + virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + encoder_initialized_ = false; + abort_ = false; + } + + virtual void EndPassHook() { + // For first pass stats test, only run first pass encoder. + if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) + abort_ |= first_pass_only_; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) { + if (!encoder_initialized_) { + // Encode in 2-pass mode. + encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); + + if (encoding_mode_ == ::libvpx_test::kTwoPassGood) + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + + encoder_initialized_ = true; + } + } + + virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) { + const uint8_t *const pkt_buf = + reinterpret_cast(pkt->data.twopass_stats.buf); + const size_t pkt_size = pkt->data.twopass_stats.sz; + + // First pass stats size equals sizeof(FIRSTPASS_STATS) + EXPECT_EQ(pkt_size, kFirstPassStatsSz) + << "Error: First pass stats size doesn't equal kFirstPassStatsSz"; + + firstpass_stats_.buf = + realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size); + memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf, + pkt_size); + firstpass_stats_.sz += pkt_size; + } + + bool encoder_initialized_; + int tiles_; + ::libvpx_test::TestMode encoding_mode_; + int set_cpu_used_; + int row_mt_mode_; + bool first_pass_only_; + vpx_fixed_buf_t firstpass_stats_; +}; + +static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) { + // fp_stats consists of 2 set of first pass encoding stats. These 2 set of + // stats are compared to check if the stats match or at least are very close. 
+  FIRSTPASS_STATS *stats1 = reinterpret_cast<FIRSTPASS_STATS *>(fp_stats->buf);
+  int nframes_ = (int)(fp_stats->sz / sizeof(FIRSTPASS_STATS));
+  FIRSTPASS_STATS *stats2 = stats1 + nframes_ / 2;
+  int i, j;
+
+  // The total stats are also output and included in the first pass stats. Here
+  // ignore that in the comparison.
+  for (i = 0; i < (nframes_ / 2 - 1); ++i) {
+    const double *frame_stats1 = reinterpret_cast<const double *>(stats1);
+    const double *frame_stats2 = reinterpret_cast<const double *>(stats2);
+
+    for (j = 0; j < kDbl; ++j) {
+      ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+                fabs(*frame_stats1) / factor)
+          << "First failure @ frame #" << i << " stat #" << j << " ("
+          << *frame_stats1 << " vs. " << *frame_stats2 << ")";
+      frame_stats1++;
+      frame_stats2++;
+    }
+
+    stats1++;
+    stats2++;
+  }
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+static void compare_fp_stats_md5(vpx_fixed_buf_t *fp_stats) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match.
+  uint8_t *stats1 = reinterpret_cast<uint8_t *>(fp_stats->buf);
+  uint8_t *stats2 = stats1 + fp_stats->sz / 2;
+  ::libvpx_test::MD5 md5_row_mt_0, md5_row_mt_1;
+
+  md5_row_mt_0.Add(stats1, fp_stats->sz / 2);
+  const char *md5_row_mt_0_str = md5_row_mt_0.Get();
+
+  md5_row_mt_1.Add(stats2, fp_stats->sz / 2);
+  const char *md5_row_mt_1_str = md5_row_mt_1.Get();
+
+  // Check md5 match.
+  ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str)
+      << "MD5 checksums don't match";
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+  first_pass_only_ = true;
+  cfg_.rc_target_bitrate = 1000;
+
+  // Test row_mt_mode: 0 vs 1 at single thread case(threads = 1, tiles_ = 0)
+  tiles_ = 0;
+  cfg_.g_threads = 1;
+
+  row_mt_mode_ = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  row_mt_mode_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if using or not using row-mt generates close stats.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+
+  // Test single thread vs multiple threads
+  row_mt_mode_ = 1;
+  tiles_ = 0;
+
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if single-thread and multi-thread stats are close enough.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+
+  // Bit exact test in row_mt mode.
+  // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
+  // result.
+  row_mt_mode_ = 1;
+  tiles_ = 2;
+
+  cfg_.g_threads = 2;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 8;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if stats match with row-mt=0/1.
+ compare_fp_stats_md5(&firstpass_stats_); +} + +class VPxEncoderThreadTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith4Params { protected: VPxEncoderThreadTest() - : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(2), + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), + tiles_(GET_PARAM(3)), threads_(GET_PARAM(4)), encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { init_flags_ = VPX_CODEC_USE_PSNR; md5_.clear(); + row_mt_mode_ = 1; + psnr_ = 0.0; + nframes_ = 0; } virtual ~VPxEncoderThreadTest() {} @@ -35,7 +239,6 @@ class VPxEncoderThreadTest SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { - cfg_.g_lag_in_frames = 3; cfg_.rc_end_usage = VPX_VBR; cfg_.rc_2pass_vbr_minsection_pct = 5; cfg_.rc_2pass_vbr_maxsection_pct = 2000; @@ -50,6 +253,8 @@ class VPxEncoderThreadTest virtual void BeginPassHook(unsigned int /*pass*/) { encoder_initialized_ = false; + psnr_ = 0.0; + nframes_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, @@ -63,14 +268,22 @@ class VPxEncoderThreadTest encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); } else { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0); encoder->Control(VP9E_SET_AQ_MODE, 3); } + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + encoder_initialized_ = true; } } + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + virtual void DecompressedFrameHook(const vpx_image_t &img, vpx_codec_pts_t /*pts*/) { ::libvpx_test::MD5 md5_res; @@ -89,40 +302,127 @@ class VPxEncoderThreadTest return true; } + double GetAveragePsnr() const { return nframes_ ? (psnr_ / nframes_) : 0.0; } + bool encoder_initialized_; int tiles_; + int threads_; ::libvpx_test::TestMode encoding_mode_; int set_cpu_used_; + int row_mt_mode_; + double psnr_; + unsigned int nframes_; std::vector md5_; }; TEST_P(VPxEncoderThreadTest, EncoderResultTest) { - std::vector single_thr_md5, multi_thr_md5; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20); - cfg_.rc_target_bitrate = 1000; + // Part 1: Bit exact test for row_mt_mode_ = 0. + // This part keeps original unit tests done before row-mt code is checked in. + row_mt_mode_ = 0; + // Encode using single thread. cfg_.g_threads = 1; init_flags_ = VPX_CODEC_USE_PSNR; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - single_thr_md5 = md5_; + const std::vector single_thr_md5 = md5_; md5_.clear(); // Encode using multiple threads. - cfg_.g_threads = 4; + cfg_.g_threads = threads_; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - multi_thr_md5 = md5_; + const std::vector multi_thr_md5 = md5_; md5_.clear(); // Compare to check if two vectors are equal. ASSERT_EQ(single_thr_md5, multi_thr_md5); + + // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test. + row_mt_mode_ = 1; + + // Encode using single thread + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + std::vector row_mt_single_thr_md5 = md5_; + md5_.clear(); + + ASSERT_EQ(single_thr_md5, row_mt_single_thr_md5); + + // Part 3: Bit exact test with row-mt on + // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact + // result. + row_mt_mode_ = 1; + row_mt_single_thr_md5.clear(); + + // Encode using 2 threads. 
+ cfg_.g_threads = 2; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + row_mt_single_thr_md5 = md5_; + md5_.clear(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const std::vector row_mt_multi_thr_md5 = md5_; + md5_.clear(); + + // Compare to check if two vectors are equal. + ASSERT_EQ(row_mt_single_thr_md5, row_mt_multi_thr_md5); + + // Part 4: PSNR test with bit_match_mode_ = 0 + row_mt_mode_ = 1; + + // Encode using single thread. + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double single_thr_psnr = GetAveragePsnr(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double multi_thr_psnr = GetAveragePsnr(); + + EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.1); } -VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest, - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(1, 9)); +INSTANTIATE_TEST_CASE_P( + VP9, VPxFirstPassEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Range(0, 4))); // cpu_used + +// Split this into two instantiations so that we can distinguish +// between very slow runs ( ie cpu_speed 0 ) vs ones that can be +// run nightly by adding Large to the title. +INSTANTIATE_TEST_CASE_P( + VP9, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(3, 9), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads + +INSTANTIATE_TEST_CASE_P( + VP9Large, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(0, 3), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads + } // namespace diff --git a/libs/libvpx/test/vp9_frame_parallel_test.cc b/libs/libvpx/test/vp9_frame_parallel_test.cc deleted file mode 100644 index 670cd4d721..0000000000 --- a/libs/libvpx/test/vp9_frame_parallel_test.cc +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#include "test/codec_factory.h" -#include "test/decode_test_driver.h" -#include "test/ivf_video_source.h" -#include "test/md5_helper.h" -#include "test/util.h" -#if CONFIG_WEBM_IO -#include "test/webm_video_source.h" -#endif -#include "vpx_mem/vpx_mem.h" - -namespace { - -using std::string; - -#if CONFIG_WEBM_IO - -struct PauseFileList { - const char *name; - // md5 sum for decoded frames which does not include skipped frames. 
- const char *expected_md5; - const int pause_frame_num; -}; - -// Decodes |filename| with |num_threads|. Pause at the specified frame_num, -// seek to next key frame and then continue decoding until the end. Return -// the md5 of the decoded frames which does not include skipped frames. -string DecodeFileWithPause(const string &filename, int num_threads, - int pause_num) { - libvpx_test::WebMVideoSource video(filename); - video.Init(); - int in_frames = 0; - int out_frames = 0; - - vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); - cfg.threads = num_threads; - vpx_codec_flags_t flags = 0; - flags |= VPX_CODEC_USE_FRAME_THREADING; - libvpx_test::VP9Decoder decoder(cfg, flags); - - libvpx_test::MD5 md5; - video.Begin(); - - do { - ++in_frames; - const vpx_codec_err_t res = - decoder.DecodeFrame(video.cxdata(), video.frame_size()); - if (res != VPX_CODEC_OK) { - EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); - break; - } - - // Pause at specified frame number. - if (in_frames == pause_num) { - // Flush the decoder and then seek to next key frame. - decoder.DecodeFrame(NULL, 0); - video.SeekToNextKeyFrame(); - } else { - video.Next(); - } - - // Flush the decoder at the end of the video. - if (!video.cxdata()) decoder.DecodeFrame(NULL, 0); - - libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img; - - // Get decompressed data - while ((img = dec_iter.Next())) { - ++out_frames; - md5.Add(img); - } - } while (video.cxdata() != NULL); - - EXPECT_EQ(in_frames, out_frames) - << "Input frame count does not match output frame count"; - - return string(md5.Get()); -} - -void DecodeFilesWithPause(const PauseFileList files[]) { - for (const PauseFileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 2; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, - DecodeFileWithPause(iter->name, t, iter->pause_frame_num)) - << "threads = " << t; - } - } -} - -TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) { - // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with - // one key frame for every ten frames. - static const PauseFileList files[] = { - { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1", - 6 }, - { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45", - 12 }, - { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8", - 26 }, - { NULL, NULL, 0 }, - }; - DecodeFilesWithPause(files); -} - -struct FileList { - const char *name; - // md5 sum for decoded frames which does not include corrupted frames. - const char *expected_md5; - // Expected number of decoded frames which does not include corrupted frames. - const int expected_frame_count; -}; - -// Decodes |filename| with |num_threads|. Return the md5 of the decoded -// frames which does not include corrupted frames. -string DecodeFile(const string &filename, int num_threads, - int expected_frame_count) { - libvpx_test::WebMVideoSource video(filename); - video.Init(); - - vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); - cfg.threads = num_threads; - const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING; - libvpx_test::VP9Decoder decoder(cfg, flags); - - libvpx_test::MD5 md5; - video.Begin(); - - int out_frames = 0; - do { - const vpx_codec_err_t res = - decoder.DecodeFrame(video.cxdata(), video.frame_size()); - // TODO(hkuang): frame parallel mode should return an error on corruption. 
- if (res != VPX_CODEC_OK) { - EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); - break; - } - - video.Next(); - - // Flush the decoder at the end of the video. - if (!video.cxdata()) decoder.DecodeFrame(NULL, 0); - - libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img; - - // Get decompressed data - while ((img = dec_iter.Next())) { - ++out_frames; - md5.Add(img); - } - } while (video.cxdata() != NULL); - - EXPECT_EQ(expected_frame_count, out_frames) - << "Input frame count does not match expected output frame count"; - - return string(md5.Get()); -} - -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 2; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, - DecodeFile(iter->name, t, iter->expected_frame_count)) - << "threads = " << t; - } - } -} - -TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) { - static const FileList files[] = { - // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with - // one key frame for every ten frames. The 11th frame has corrupted data. - { "invalid-vp90-2-07-frame_parallel-1.webm", - "0549d0f45f60deaef8eb708e6c0eb6cb", 30 }, - // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with - // one key frame for every ten frames. The 1st and 31st frames have - // corrupted data. - { "invalid-vp90-2-07-frame_parallel-2.webm", - "6a1f3cf6f9e7a364212fadb9580d525e", 20 }, - // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with - // one key frame for every ten frames. The 5th and 13th frames have - // corrupted data. - { "invalid-vp90-2-07-frame_parallel-3.webm", - "8256544308de926b0681e04685b98677", 27 }, - { NULL, NULL, 0 }, - }; - DecodeFiles(files); -} - -TEST(VP9MultiThreadedFrameParallel, ValidFileTest) { - static const FileList files[] = { -#if CONFIG_VP9_HIGHBITDEPTH - { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 }, -#endif - { NULL, NULL, 0 }, - }; - DecodeFiles(files); -} -#endif // CONFIG_WEBM_IO -} // namespace diff --git a/libs/libvpx/test/vp9_intrapred_test.cc b/libs/libvpx/test/vp9_intrapred_test.cc index 1c4a3392e4..39c5e79ebd 100644 --- a/libs/libvpx/test/vp9_intrapred_test.cc +++ b/libs/libvpx/test/vp9_intrapred_test.cc @@ -28,25 +28,25 @@ using libvpx_test::ACMRandom; const int count_test_block = 100000; -typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, int bps); +typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); -struct IntraPredFunc { - IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL, - int block_size_value = 0, int bit_depth_value = 0) +struct IntraPredParam { + IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL, + int block_size_value = 0, int bit_depth_value = 0) : pred_fn(pred), ref_fn(ref), block_size(block_size_value), bit_depth(bit_depth_value) {} - IntraPred pred_fn; - IntraPred ref_fn; + IntraPredFunc pred_fn; + IntraPredFunc ref_fn; int block_size; int bit_depth; }; -class VP9IntraPredTest : public ::testing::TestWithParam { +template +class IntraPredTest : public ::testing::TestWithParam { public: - void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst, - uint16_t *ref_dst) { + void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int block_size = params_.block_size; above_row_ = above_data + 
16; @@ -56,13 +56,16 @@ class VP9IntraPredTest : public ::testing::TestWithParam { int error_count = 0; for (int i = 0; i < count_test_block; ++i) { // Fill edges with random data, try first with saturated values. - for (int x = -1; x <= block_size * 2; x++) { + for (int x = -1; x < block_size; x++) { if (i == 0) { above_row_[x] = mask_; } else { above_row_[x] = rnd.Rand16() & mask_; } } + for (int x = block_size; x < 2 * block_size; x++) { + above_row_[x] = above_row_[block_size - 1]; + } for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -78,17 +81,12 @@ class VP9IntraPredTest : public ::testing::TestWithParam { protected: virtual void SetUp() { - params_ = GetParam(); + params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; } - void Predict() { - const int bit_depth = params_.bit_depth; - params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); - ASM_REGISTER_STATE_CHECK( - params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); - } + void Predict(); void CheckPrediction(int test_case_number, int *error_count) const { // For each pixel ensure that the calculated value is the same as reference. @@ -104,17 +102,363 @@ class VP9IntraPredTest : public ::testing::TestWithParam { } } - uint16_t *above_row_; - uint16_t *left_col_; - uint16_t *dst_; - uint16_t *ref_dst_; + Pixel *above_row_; + Pixel *left_col_; + Pixel *dst_; + Pixel *ref_dst_; ptrdiff_t stride_; int mask_; - IntraPredFunc params_; + PredParam params_; }; +template <> +void IntraPredTest::Predict() { + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_)); +} + +typedef IntraPredTest VP9IntraPredTest; + TEST_P(VP9IntraPredTest, IntraPredTests) { + // max block size is 32 + DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]); + DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]); + DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]); + RunTest(left_col, above_data, dst, ref_dst); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_sse2, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d207_predictor_4x4_sse2, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_sse2, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_sse2, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_sse2, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_sse2, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_sse2, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_sse2, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_sse2, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_sse2, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_sse2, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_sse2, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_sse2, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_sse2, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_sse2, + 
&vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_sse2, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_sse2, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_sse2, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_sse2, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_sse2, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_sse2, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_sse2, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_sse2, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_sse2, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_sse2, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_sse2, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_sse2, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_sse2, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_sse2, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_sse2, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_ssse3, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_ssse3, + &vpx_d63_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d63_predictor_8x8_ssse3, + &vpx_d63_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d63_predictor_16x16_ssse3, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_ssse3, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_ssse3, + &vpx_d153_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_ssse3, + &vpx_d153_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_ssse3, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_ssse3, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_8x8_ssse3, + &vpx_d207_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_ssse3, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_ssse3, + &vpx_d207_predictor_32x32_c, 32, 8))); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_neon, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_neon, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_neon, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d135_predictor_16x16_neon, + &vpx_d135_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d135_predictor_32x32_neon, + &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_neon, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_neon, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_neon, + &vpx_dc_128_predictor_16x16_c, 16, 8), + 
IntraPredParam(&vpx_dc_128_predictor_32x32_neon, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_neon, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_neon, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_neon, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_neon, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_neon, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_neon, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_neon, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_neon, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_neon, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_neon, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_neon, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_neon, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_neon, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_neon, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_neon, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_neon, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_neon, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_neon, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_neon, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_neon, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_neon, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_neon, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_neon, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_neon, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_NEON + +#if HAVE_DSPR2 +INSTANTIATE_TEST_CASE_P( + DSPR2, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2, + &vpx_dc_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_predictor_8x8_dspr2, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_dspr2, + &vpx_dc_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_h_predictor_4x4_dspr2, + &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_dspr2, + &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_dspr2, + &vpx_h_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_tm_predictor_4x4_dspr2, + &vpx_tm_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_tm_predictor_8x8_dspr2, + &vpx_tm_predictor_8x8_c, 8, 8))); +#endif // HAVE_DSPR2 + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P( + MSA, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_dc_128_predictor_4x4_msa, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_msa, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_msa, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_msa, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_msa, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_msa, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_msa, + 
&vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_msa, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_msa, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_msa, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_msa, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_msa, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_msa, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_msa, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_msa, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_msa, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_msa, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_msa, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_msa, &vpx_h_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_h_predictor_32x32_msa, &vpx_h_predictor_32x32_c, 32, + 8), + IntraPredParam(&vpx_tm_predictor_4x4_msa, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_msa, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_msa, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_msa, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_msa, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_msa, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_msa, &vpx_v_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_v_predictor_32x32_msa, &vpx_v_predictor_32x32_c, 32, + 8))); +#endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_vsx, &vpx_d45_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_vsx, &vpx_d45_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_vsx, &vpx_d63_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_vsx, &vpx_d63_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_vsx, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_vsx, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_vsx, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_vsx, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_vsx, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_vsx, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_vsx, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_vsx, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_vsx, &vpx_h_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_h_predictor_32x32_vsx, &vpx_h_predictor_32x32_c, 32, + 8), + IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4, + 8), + 
IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_vsx, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_vsx, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_16x16_vsx, &vpx_v_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_v_predictor_32x32_vsx, &vpx_v_predictor_32x32_c, 32, + 8))); +#endif // HAVE_VSX + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bps); +struct HighbdIntraPredParam { + HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL, + int block_size_value = 0, int bit_depth_value = 0) + : pred_fn(pred), ref_fn(ref), block_size(block_size_value), + bit_depth(bit_depth_value) {} + + HighbdIntraPred pred_fn; + HighbdIntraPred ref_fn; + int block_size; + int bit_depth; +}; + +template <> +void IntraPredTest::Predict() { + const int bit_depth = params_.bit_depth; + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); +} + +typedef IntraPredTest VP9HighbdIntraPredTest; + +TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { // max block size is 32 DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]); @@ -123,89 +467,575 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { RunTest(left_col, above_data, dst, ref_dst); } +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 
16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + 
HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 + #if HAVE_SSE2 -#if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_8, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 8))); + SSE2_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + 
&vpx_highbd_dc_left_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_10, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 10))); + SSE2_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + 
&vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_12, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - 
&vpx_highbd_tm_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 12))); + SSE2_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + 
HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + 
&vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_CASE_P( + NEON_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + 
&vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + NEON_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), + 
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_SSE2 } // namespace diff --git a/libs/libvpx/test/vp9_motion_vector_test.cc b/libs/libvpx/test/vp9_motion_vector_test.cc new file mode 100644 index 0000000000..1030204ae3 --- /dev/null +++ b/libs/libvpx/test/vp9_motion_vector_test.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" + +namespace { +#define MAX_EXTREME_MV 1 +#define MIN_EXTREME_MV 2 + +// Encoding modes +const libvpx_test::TestMode kEncodingModeVectors[] = { + ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime, +}; + +// Encoding speeds +const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6 }; + +// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV. 
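// The two mode values listed below feed VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST in
// PreEncodeFrameHook(). As a rough, hypothetical sketch of the intent (the
// real override lives inside the encoder, not in this test file), the control
// asks the encoder to pin every selected motion vector to one extreme of the
// allowed range:
//
//   int ForceExtremeMv(int mv_test_mode, int mv_min, int mv_max) {
//     // Hypothetical helper: mode 1 pins to the maximum allowed value,
//     // mode 2 pins to the minimum.
//     return (mv_test_mode == MAX_EXTREME_MV) ? mv_max : mv_min;
//   }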
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  virtual ~MotionVectorTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  video.reset(new libvpx_test::YUVVideoSource(
+      "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
+      30, 1, 0, 5));
+
+  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+VP9_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors),
+                          ::testing::ValuesIn(kMVTestModes));
+}  // namespace
diff --git a/libs/libvpx/test/vp9_quantize_test.cc b/libs/libvpx/test/vp9_quantize_test.cc
index 4643895021..b18d4522ce 100644
--- a/libs/libvpx/test/vp9_quantize_test.cc
+++ b/libs/libvpx/test/vp9_quantize_test.cc
@@ -14,9 +14,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -24,11 +26,12 @@
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
 
 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int number_of_iterations = 100;
 
 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
@@ -38,307 +41,494 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                              tran_low_t *dqcoeff, const int16_t *dequant,
                              uint16_t *eob, const int16_t *scan,
                              const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
     QuantizeParam;
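// A QuantizeParam entry therefore carries: the function under test, the C
// reference it must match, the bit depth, the largest transform dimension
// exercised (16 covers 4x4/8x8/16x16, 32 covers 32x32), and whether the pair
// uses the "fp" call convention. As a purely illustrative sketch (not one of
// the real instantiations in this file), an 8-bit entry could be built as:
//
//   std::tr1::make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8,
//                        16, false)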
-class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+// Wrapper for FP version which does not use zbin or quant_shift.
+typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
+                               int skip_block, const int16_t *round,
+                               const int16_t *quant, tran_low_t *qcoeff,
+                               tran_low_t *dqcoeff, const int16_t *dequant,
+                               uint16_t *eob, const int16_t *scan,
+                               const int16_t *iscan);
+
+template <QuantizeFPFunc fn>
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
+                    const int16_t *zbin, const int16_t *round,
+                    const int16_t *quant, const int16_t *quant_shift,
+                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
+                    const int16_t *iscan) {
+  (void)zbin;
+  (void)quant_shift;
+
+  fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob,
+     scan, iscan);
+}
+
+class VP9QuantizeBase {
  public:
-  virtual ~VP9QuantizeTest() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+    max_value_ = (1 << bit_depth_) - 1;
+    zbin_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
+    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
+    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
+    round_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
+    quant_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
+    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
+    dequant_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
-};
-
-class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
- public:
-  virtual ~VP9Quantize32Test() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  ~VP9QuantizeBase() {
+    vpx_free(zbin_ptr_);
+    vpx_free(round_fp_ptr_);
+    vpx_free(quant_fp_ptr_);
+    vpx_free(round_ptr_);
+    vpx_free(quant_ptr_);
+    vpx_free(quant_shift_ptr_);
+    vpx_free(dequant_ptr_);
+    zbin_ptr_ = NULL;
+    round_fp_ptr_ = NULL;
+    quant_fp_ptr_ = NULL;
+    round_ptr_ = NULL;
+    quant_ptr_ = NULL;
+    quant_shift_ptr_ = NULL;
+    dequant_ptr_ = NULL;
+    libvpx_test::ClearSystemState();
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ protected:
+  int16_t *zbin_ptr_;
+  int16_t *round_fp_ptr_;
+  int16_t *quant_fp_ptr_;
+  int16_t *round_ptr_;
+  int16_t *quant_ptr_;
+  int16_t *quant_shift_ptr_;
+  int16_t *dequant_ptr_;
+  const vpx_bit_depth_t bit_depth_;
+  int max_value_;
+  const int max_size_;
+  const bool is_fp_;
+};
+
+class VP9QuantizeTest : public VP9QuantizeBase,
+                        public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  VP9QuantizeTest()
+      : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
+        quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
 
  protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
+  const QuantizeFunc quantize_op_;
+  const QuantizeFunc ref_quantize_op_;
 };
+// This quantizer compares the AC coefficients to the quantization step size to
+// determine if further multiplication operations are needed.
+// Based on vp9_quantize_fp_sse2().
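// In essence, the reference below applies, per coefficient (index 0 of the
// round/quant/dequant tables for DC, index 1 for AC):
//
//   abs_q   = (clamp(|coeff| + round, INT16_MIN, INT16_MAX) * quant) >> 16;
//   qcoeff  = sign(coeff) * abs_q;
//   dqcoeff = qcoeff * dequant;
//
// and it zeroes an entire 16-coefficient group (after the first group) when
// every magnitude in it is at most dequant[1] / 2. Worked example, using the
// fp tables as GenerateHelperArrays() below builds them: qlookup = 50 gives
// round_fp = (64 * 50) >> 7 = 25 and quant_fp = (1 << 16) / 50 = 1310; take
// dequant = 50. A coefficient of 137 then quantizes to
// ((137 + 25) * 1310) >> 16 = 3 and dequantizes back to 3 * 50 = 150.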
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, eob = -1; + const int thr = dequant_ptr[1] >> 1; + (void)iscan; + (void)skip_block; + assert(!skip_block); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i += 16) { + int y; + int nzflag_cnt = 0; + int abs_coeff[16]; + int coeff_sign[16]; + + // count nzflag for each row (16 tran_low_t) + for (y = 0; y < 16; ++y) { + const int rc = i + y; + const int coeff = coeff_ptr[rc]; + coeff_sign[y] = (coeff >> 31); + abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y]; + // The first 16 are skipped in the sse2 code. Do the same here to match. + if (i >= 16 && (abs_coeff[y] <= thr)) { + nzflag_cnt++; + } + } + + for (y = 0; y < 16; ++y) { + const int rc = i + y; + // If all of the AC coeffs in a row has magnitude less than the + // quantization step_size/2, quantize to zero. + if (nzflag_cnt < 16) { + int tmp = + clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + + // Scan for eob. + for (i = 0; i < n_coeffs; i++) { + // Use the scan order to find the correct eob. + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + const int max_qrounding_factor_fp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. 
+ dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; - } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. 
" - << "First failed at test case " << first_failure; -} + Buffer coeff = Buffer(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; -TEST_P(VP9Quantize32Test, OperationCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = TX_32X32; - const TX_TYPE tx_type = (TX_TYPE)(i % 4); + // Test skip block for the first three iterations to catch all the different + // sizes. + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + const int count = (4 << sz) * (4 << sz); + coeff.Set(&rnd, -max_value_, max_value_); + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + Buffer coeff = Buffer(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; + for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; + int count = (4 << sz) * (4 << sz); // Two random entries - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - 
dequant_ptr[j] = rnd.Rand16(); - } - - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + coeff.Set(0); + coeff.TopLeftPixel()[rnd(count)] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + coeff.TopLeftPixel()[rnd(count)] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. 
" - << "First failed at test case " << first_failure; } -TEST_P(VP9Quantize32Test, EOBCheck) { +TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = TX_32X32; - TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - // Two random entries - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } + Buffer coeff = Buffer(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + uint16_t eob; + TX_SIZE starting_sz, ending_sz; - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + const int skip_block = 0; + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; + const int count = (4 << sz) * (4 << sz); + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. 
+ threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff.Set(&rnd, -500, 500); + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < 100000000 / count; ++j) { + quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), + dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, + scan_order->scan, scan_order->iscan); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + if (i == 0) printf("Bypass calculations.\n"); + if (i == 1) printf("Full calculations.\n"); + printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, + elapsed_time / 1000); + } + printf("\n"); } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } + using std::tr1::make_tuple; #if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds. +// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8), INSTANTIATE_TEST_CASE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_12))); + ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + +#else INSTANTIATE_TEST_CASE_P( - SSE2, VP9Quantize32Test, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12))); -#endif // HAVE_SSE2 + SSE2, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); +#else +INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +#endif + +#if ARCH_X86_64 +// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. 
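+// They are registered under a DISABLED_ prefix so they can still be run
+// manually with --gtest_also_run_disabled_tests.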
+INSTANTIATE_TEST_CASE_P( + DISABLED_SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or +// highbitdepth configurations. +#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + // Even though SSSE3 and AVX do not match the reference + // code, we can keep them in sync with each other. + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, + false))); +#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds. +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH + +// Only useful to compare "Speed" test results. +INSTANTIATE_TEST_CASE_P( + DISABLED_C, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, + 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); } // namespace diff --git a/libs/libvpx/test/vp9_scale_test.cc b/libs/libvpx/test/vp9_scale_test.cc new file mode 100644 index 0000000000..5d7d38e89a --- /dev/null +++ b/libs/libvpx/test/vp9_scale_test.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { + +typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler); + +class ScaleTest : public VpxScaleBase, + public ::testing::TestWithParam { + public: + virtual ~ScaleTest() {} + + protected: + virtual void SetUp() { scale_fn_ = GetParam(); } + + void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); + } + + void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + ASM_REGISTER_STATE_CHECK( + scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); + } + + void RunTest() { + static const int kNumSizesToTest = 20; + static const int kNumScaleFactorsToTest = 4; + static const int kSizesToTest[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 + }; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). 
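+                // (The convolve filters step in 1/16-pel units, so a 4:1
+                // downscale corresponds to that maximum step of 16 * 4 = 64.)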
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages( + src_width, src_height, dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, + dst_width, dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); + } + } + } + } + } + } + } + + void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, + const int stride, const int width, const int height, + const int plane_idx) const { + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (ref[y * stride + x] != opt[y * stride + x]) { + printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx, + y, x, ref[y * stride + x], opt[y * stride + x]); + break; + } + } + } + } + + void PrintDiff() const { + assert(ref_img_.y_stride == dst_img_.y_stride); + assert(ref_img_.y_width == dst_img_.y_width); + assert(ref_img_.y_height == dst_img_.y_height); + assert(ref_img_.uv_stride == dst_img_.uv_stride); + assert(ref_img_.uv_width == dst_img_.uv_width); + assert(ref_img_.uv_height == dst_img_.uv_height); + + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer, + ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height, + 0); + PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 1); + PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 2); + } + } + + ScaleFrameFunc scale_fn_; +}; + +TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } + +TEST_P(ScaleTest, DISABLED_Speed) { + static const int kCountSpeedTestBlock = 100; + static const int kNumScaleFactorsToTest = 4; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + const int src_width = 1280; + const int src_height = 720; + for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) { + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. 
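+          // (I420 chroma is stored at half resolution in each direction, so
+          // odd luma dimensions would leave no whole-pixel chroma plane size.)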
+ if (dst_width & 1 || dst_height & 1) { + continue; + } + ASSERT_NO_FATAL_FAILURE( + ResetScaleImages(src_width, src_height, dst_width, dst_height)); + ASM_REGISTER_STATE_CHECK( + ReferenceScaleFrame(filter_type, phase_scaler)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ScaleFrame(filter_type, phase_scaler); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CompareImages(dst_img_); + DeallocScaleImages(); + + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d, scale time: %5d ms\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up, elapsed_time); + } + } + } + } +} + +INSTANTIATE_TEST_CASE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); +#endif // HAVE_NEON + +} // namespace libvpx_test diff --git a/libs/libvpx/test/vp9_skip_loopfilter_test.cc b/libs/libvpx/test/vp9_skip_loopfilter_test.cc index e847bbddf3..d41a784a22 100644 --- a/libs/libvpx/test/vp9_skip_loopfilter_test.cc +++ b/libs/libvpx/test/vp9_skip_loopfilter_test.cc @@ -85,8 +85,8 @@ class SkipLoopFilterTest { // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. Filename: " << md5_file_name; } // Reads the next line of the MD5 file. 
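+  # Each pass below writes to the same ${output_file} name, so the
+  # files_exist() checks in the callers effectively validate only the last
+  # thread count's output (see the TODO above).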
diff --git a/libs/libvpx/test/vp9_subtract_test.cc b/libs/libvpx/test/vp9_subtract_test.cc index 19ed304315..62845ad615 100644 --- a/libs/libvpx/test/vp9_subtract_test.cc +++ b/libs/libvpx/test/vp9_subtract_test.cc @@ -101,4 +101,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_msa)); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); +#endif + } // namespace vp9 diff --git a/libs/libvpx/test/vp9_thread_test.cc b/libs/libvpx/test/vp9_thread_test.cc index 3e3fd25acb..576f5e906b 100644 --- a/libs/libvpx/test/vp9_thread_test.cc +++ b/libs/libvpx/test/vp9_thread_test.cc @@ -187,8 +187,8 @@ void DecodeFiles(const FileList files[]) { for (const FileList *iter = files; iter->name != NULL; ++iter) { SCOPED_TRACE(iter->name); for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = " - << t; + EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) + << "threads = " << t; } } } diff --git a/libs/libvpx/test/vpx_scale_test.cc b/libs/libvpx/test/vpx_scale_test.cc index 81773fe5b6..ac75dceb23 100644 --- a/libs/libvpx/test/vpx_scale_test.cc +++ b/libs/libvpx/test/vpx_scale_test.cc @@ -14,149 +14,17 @@ #include "./vpx_scale_rtcd.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/vpx_scale_test.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" #include "vpx_scale/yv12config.h" -namespace { +namespace libvpx_test { typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, YV12_BUFFER_CONFIG *dst_ybf); -class VpxScaleBase { - public: - virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } - - void ResetImage(int width, int height) { - width_ = width; - height_ = height; - memset(&img_, 0, sizeof(img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, - VP8BORDERINPIXELS)); - memset(img_.buffer_alloc, kBufFiller, img_.frame_size); - FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, - img_.y_stride); - FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - - memset(&ref_img_, 0, sizeof(ref_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); - - memset(&cpy_img_, 0, sizeof(cpy_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); - ReferenceCopyFrame(); - } - - void DeallocImage() { - vp8_yv12_de_alloc_frame_buffer(&img_); - vp8_yv12_de_alloc_frame_buffer(&ref_img_); - vp8_yv12_de_alloc_frame_buffer(&cpy_img_); - } - - protected: - static const int kBufFiller = 123; - static const int kBufMax = kBufFiller - 1; - - static void FillPlane(uint8_t *buf, int width, int height, int stride) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - buf[x + (y * stride)] = (x + (width * y)) % kBufMax; - } - } - } - - static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, - int width, int height, int stride, int padding) { - // Copy the outermost visible pixel to a distance of at least 'padding.' - // The buffers are allocated such that there may be excess space outside the - // padding. 
As long as the minimum amount of padding is achieved it is not - // necessary to fill this space as well. - uint8_t *left = buf - padding; - uint8_t *right = buf + crop_width; - const int right_extend = padding + (width - crop_width); - const int bottom_extend = padding + (height - crop_height); - - // Fill the border pixels from the nearest image pixel. - for (int y = 0; y < crop_height; ++y) { - memset(left, left[padding], padding); - memset(right, right[-1], right_extend); - left += stride; - right += stride; - } - - left = buf - padding; - uint8_t *top = left - (stride * padding); - // The buffer does not always extend as far as the stride. - // Equivalent to padding + width + padding. - const int extend_width = padding + crop_width + right_extend; - - // The first row was already extended to the left and right. Copy it up. - for (int y = 0; y < padding; ++y) { - memcpy(top, left, extend_width); - top += stride; - } - - uint8_t *bottom = left + (crop_height * stride); - for (int y = 0; y < bottom_extend; ++y) { - memcpy(bottom, left + (crop_height - 1) * stride, extend_width); - bottom += stride; - } - } - - void ReferenceExtendBorder() { - ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, - ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, - ref_img_.y_stride, ref_img_.border); - ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - } - - void ReferenceCopyFrame() { - // Copy img_ to ref_img_ and extend frame borders. This will be used for - // verifying extend_fn_ as well as copy_frame_fn_. 
- EXPECT_EQ(ref_img_.frame_size, img_.frame_size); - for (int y = 0; y < img_.y_crop_height; ++y) { - for (int x = 0; x < img_.y_crop_width; ++x) { - ref_img_.y_buffer[x + y * ref_img_.y_stride] = - img_.y_buffer[x + y * img_.y_stride]; - } - } - - for (int y = 0; y < img_.uv_crop_height; ++y) { - for (int x = 0; x < img_.uv_crop_width; ++x) { - ref_img_.u_buffer[x + y * ref_img_.uv_stride] = - img_.u_buffer[x + y * img_.uv_stride]; - ref_img_.v_buffer[x + y * ref_img_.uv_stride] = - img_.v_buffer[x + y * img_.uv_stride]; - } - } - - ReferenceExtendBorder(); - } - - void CompareImages(const YV12_BUFFER_CONFIG actual) { - EXPECT_EQ(ref_img_.frame_size, actual.frame_size); - EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, - ref_img_.frame_size)); - } - - YV12_BUFFER_CONFIG img_; - YV12_BUFFER_CONFIG ref_img_; - YV12_BUFFER_CONFIG cpy_img_; - int width_; - int height_; -}; - class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { @@ -178,11 +46,11 @@ class ExtendBorderTest static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ResetImage(kSizesToTest[w], kSizesToTest[h]); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); ExtendBorder(); - ReferenceExtendBorder(); CompareImages(img_); - DeallocImage(); + DeallocImages(); } } } @@ -204,7 +72,7 @@ class CopyFrameTest : public VpxScaleBase, virtual void SetUp() { copy_frame_fn_ = GetParam(); } void CopyFrame() { - ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); } void RunTest() { @@ -217,11 +85,11 @@ class CopyFrameTest : public VpxScaleBase, static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ResetImage(kSizesToTest[w], kSizesToTest[h]); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); ReferenceCopyFrame(); CopyFrame(); - CompareImages(cpy_img_); - DeallocImage(); + CompareImages(dst_img_); + DeallocImages(); } } } @@ -233,4 +101,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, ::testing::Values(vp8_yv12_copy_frame_c)); -} // namespace + +} // namespace libvpx_test diff --git a/libs/libvpx/test/vpx_scale_test.h b/libs/libvpx/test/vpx_scale_test.h new file mode 100644 index 0000000000..dcbd02b91f --- /dev/null +++ b/libs/libvpx/test/vpx_scale_test.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_VPX_SCALE_TEST_H_ +#define TEST_VPX_SCALE_TEST_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +using libvpx_test::ACMRandom; + +namespace libvpx_test { + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } + + void ResetImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); + ASSERT_EQ( + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)); + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetImages(const int width, const int height) { + ResetImage(&img_, width, height); + ResetImage(&ref_img_, width, height); + ResetImage(&dst_img_, width, height); + + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); +#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#else + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#endif + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetScaleImages(const int src_width, const int src_height, + const int dst_width, const int dst_height) { + ResetScaleImage(&img_, src_width, src_height); + ResetScaleImage(&ref_img_, dst_width, dst_height); + ResetScaleImage(&dst_img_, dst_width, dst_height); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void DeallocImages() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&dst_img_); + } + + void DeallocScaleImages() { + vpx_free_frame_buffer(&img_); + vpx_free_frame_buffer(&ref_img_); + vpx_free_frame_buffer(&dst_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. 
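+    // Rough per-row layout (widths not to scale):
+    //   [ padding | crop_width visible pixels | padding + (width - crop_width) ]
+    // 'left' below points at the first border byte and 'right' at the first
+    // byte past the visible pixels.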
+ uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + memset(left, left[padding], padding); + memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. + for (int y = 0; y < padding; ++y) { + memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, + ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, ref_img_.border); + ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG dst_img_; +}; + +} // namespace libvpx_test + +#endif // TEST_VPX_SCALE_TEST_H_ diff --git a/libs/libvpx/test/vpx_temporal_svc_encoder.sh b/libs/libvpx/test/vpx_temporal_svc_encoder.sh index fcc8cb4ff4..56a7902f4f 100755 --- a/libs/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/libs/libvpx/test/vpx_temporal_svc_encoder.sh @@ -40,6 +40,8 @@ vpx_tsvc_encoder() { local timebase_den="1000" local speed="6" local frame_drop_thresh="30" + local max_threads="4" + local error_resilient="1" shift 2 @@ -48,11 +50,22 @@ vpx_tsvc_encoder() { return 1 fi - eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \ - "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \ - "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \ - "$@" \ - ${devnull} + # TODO(tomfinegan): Verify file output for all thread runs. 
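+# Callers capture the result, e.g. passes=$(vpxenc_passes_param), and forward
+# "${passes}" to vpxenc: realtime-only builds get --passes=1, all others
+# --passes=2.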
+ for threads in $(seq $max_threads); do + if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} + else + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" "8" ${devnull} + fi + done } # Confirms that all expected output files exist given the output file name @@ -72,193 +85,217 @@ files_exist() { vpx_tsvc_encoder_vp8_mode_0() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 0 200 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_0" + vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream - files_exist "${FUNCNAME}" 1 || return 1 + files_exist "${output_basename}" 1 || return 1 fi } vpx_tsvc_encoder_vp8_mode_1() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 1 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_1" + vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_2() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 2 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_2" + vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_3() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 3 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_3" + vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_4() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 4 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_4" + vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_5() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 5 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_5" + vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_6() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 6 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_6" + vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || 
return 1 fi } vpx_tsvc_encoder_vp8_mode_7() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_7" + vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams - files_exist "${FUNCNAME}" 5 || return 1 + files_exist "${output_basename}" 5 || return 1 fi } vpx_tsvc_encoder_vp8_mode_8() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 8 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_8" + vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_9() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 9 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_9" + vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_10() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 10 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_10" + vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_11() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 11 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_11" + vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_0() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 0 200 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_0" + vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream - files_exist "${FUNCNAME}" 1 || return 1 + files_exist "${output_basename}" 1 || return 1 fi } vpx_tsvc_encoder_vp9_mode_1() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 1 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_1" + vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_2() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 2 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_2" + vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_3() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 3 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_3" + vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist 
"${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_4() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 4 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_4" + vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_5() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 5 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_5" + vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_6() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 6 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_6" + vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_7() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_7" + vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams - files_exist "${FUNCNAME}" 5 || return 1 + files_exist "${output_basename}" 5 || return 1 fi } vpx_tsvc_encoder_vp9_mode_8() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 8 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_8" + vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_9() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 9 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_9" + vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_10() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 10 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_10" + vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_11() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 11 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_11" + vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } diff --git a/libs/libvpx/test/vpxenc.sh b/libs/libvpx/test/vpxenc.sh index e8994992ae..0c160dafc0 100755 --- a/libs/libvpx/test/vpxenc.sh +++ b/libs/libvpx/test/vpxenc.sh @@ -90,6 +90,15 @@ vpxenc_rt_params() { --undershoot-pct=50" } +# Forces --passes to 1 with CONFIG_REALTIME_ONLY. 
+vpxenc_passes_param() { + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then + echo "--passes=1" + else + echo "--passes=2" + fi +} + # Wrapper function for running vpxenc with pipe input. Requires that # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the # input file path and shifted away. All remaining parameters are passed through @@ -218,9 +227,11 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --ivf \ --output="${output}" @@ -235,9 +246,11 @@ vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! -e "${output}" ]; then @@ -339,11 +352,13 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --lossless=1 if [ ! -e "${output}" ]; then @@ -356,11 +371,13 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --min-q=0 \ --max-q=0 @@ -377,12 +394,13 @@ vpxenc_vp9_webm_lag10_frames20() { local readonly lag_total_frames=20 local readonly lag_frames=10 local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ - --passes=2 \ + "${passes}" \ --auto-alt-ref=1 if [ ! -e "${output}" ]; then @@ -397,9 +415,11 @@ vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! 
-e "${output}" ]; then @@ -412,18 +432,21 @@ vpxenc_vp9_webm_non_square_par() { vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt - vpxenc_vp8_webm_2pass - vpxenc_vp8_webm_lag10_frames20 vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm vpxenc_vp9_webm_rt vpxenc_vp9_webm_rt_multithread_tiled vpxenc_vp9_webm_rt_multithread_tiled_frameparallel - vpxenc_vp9_webm_2pass vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 vpxenc_vp9_webm_non_square_par" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + vpxenc_tests="$vpxenc_tests + vpxenc_vp8_webm_2pass + vpxenc_vp8_webm_lag10_frames20 + vpxenc_vp9_webm_2pass" +fi run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/libs/libvpx/test/webm_video_source.h b/libs/libvpx/test/webm_video_source.h index 53713618ee..09c007a3f3 100644 --- a/libs/libvpx/test/webm_video_source.h +++ b/libs/libvpx/test/webm_video_source.h @@ -40,8 +40,8 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(vpx_ctx_->file != NULL) + << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; diff --git a/libs/libvpx/test/y4m_video_source.h b/libs/libvpx/test/y4m_video_source.h index 2682ddde3d..1301f69703 100644 --- a/libs/libvpx/test/y4m_video_source.h +++ b/libs/libvpx/test/y4m_video_source.h @@ -34,8 +34,8 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { diff --git a/libs/libvpx/test/yuv_video_source.h b/libs/libvpx/test/yuv_video_source.h index 71ad2ab9a6..aee6b2ffbb 100644 --- a/libs/libvpx/test/yuv_video_source.h +++ b/libs/libvpx/test/yuv_video_source.h @@ -43,8 +43,8 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); } diff --git a/libs/libvpx/third_party/googletest/README.libvpx b/libs/libvpx/third_party/googletest/README.libvpx index 0e3b8b9377..2cd6910b41 100644 --- a/libs/libvpx/third_party/googletest/README.libvpx +++ b/libs/libvpx/third_party/googletest/README.libvpx @@ -1,7 +1,7 @@ -URL: http://code.google.com/p/googletest/ -Version: 1.7.0 +URL: https://github.com/google/googletest +Version: 1.8.0 License: BSD -License File: COPYING +License File: LICENSE Description: Google's framework for writing C++ tests on a variety of platforms @@ -12,10 +12,13 @@ failures, various options for running the tests, and XML test report generation. Local Modifications: -- Removed unused declarations of kPathSeparatorString to have warning - free build. -- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P - and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables - under GCC 5. 
-- Only define g_in_fast_death_test_child for non-Windows builds; quiets an - unused variable warning. +- Remove everything but: + googletest-release-1.8.0/googletest/ + CHANGES + CONTRIBUTORS + include + LICENSE + README.md + src +- Suppress unsigned overflow instrumentation in the LCG + https://github.com/google/googletest/pull/1066 diff --git a/libs/libvpx/third_party/googletest/src/README b/libs/libvpx/third_party/googletest/src/README deleted file mode 100644 index 26f35a8479..0000000000 --- a/libs/libvpx/third_party/googletest/src/README +++ /dev/null @@ -1,435 +0,0 @@ -Google C++ Testing Framework -============================ - -http://code.google.com/p/googletest/ - -Overview --------- - -Google's framework for writing C++ tests on a variety of platforms -(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the -xUnit architecture. Supports automatic test discovery, a rich set of -assertions, user-defined assertions, death tests, fatal and non-fatal -failures, various options for running the tests, and XML test report -generation. - -Please see the project page above for more information as well as the -mailing list for questions, discussions, and development. There is -also an IRC channel on OFTC (irc.oftc.net) #gtest available. Please -join us! - -Requirements for End Users --------------------------- - -Google Test is designed to have fairly minimal requirements to build -and use with your projects, but there are some. Currently, we support -Linux, Windows, Mac OS X, and Cygwin. We will also make our best -effort to support other platforms (e.g. Solaris, AIX, and z/OS). -However, since core members of the Google Test project have no access -to these platforms, Google Test may have outstanding issues there. If -you notice any problems on your platform, please notify -googletestframework@googlegroups.com. Patches for fixing them are -even more welcome! - -### Linux Requirements ### - -These are the base requirements to build and use Google Test from a source -package (as described below): - * GNU-compatible Make or gmake - * POSIX-standard shell - * POSIX(-2) Regular Expressions (regex.h) - * A C++98-standard-compliant compiler - -### Windows Requirements ### - - * Microsoft Visual C++ 7.1 or newer - -### Cygwin Requirements ### - - * Cygwin 1.5.25-14 or newer - -### Mac OS X Requirements ### - - * Mac OS X 10.4 Tiger or newer - * Developer Tools Installed - -Also, you'll need CMake 2.6.4 or higher if you want to build the -samples using the provided CMake script, regardless of the platform. - -Requirements for Contributors ------------------------------ - -We welcome patches. If you plan to contribute a patch, you need to -build Google Test and its own tests from an SVN checkout (described -below), which has further requirements: - - * Python version 2.3 or newer (for running some of the tests and - re-generating certain source files from templates) - * CMake 2.6.4 or newer - -Getting the Source ------------------- - -There are two primary ways of getting Google Test's source code: you -can download a stable source release in your preferred archive format, -or directly check out the source from our Subversion (SVN) repositary. -The SVN checkout requires a few extra steps and some extra software -packages on your system, but lets you track the latest development and -make patches much more easily, so we highly encourage it. - -### Source Package ### - -Google Test is released in versioned source packages which can be -downloaded from the download page [1]. 
Several different archive -formats are provided, but the only difference is the tools used to -manipulate them, and the size of the resulting file. Download -whichever you are most comfortable with. - - [1] http://code.google.com/p/googletest/downloads/list - -Once the package is downloaded, expand it using whichever tools you -prefer for that type. This will result in a new directory with the -name "gtest-X.Y.Z" which contains all of the source code. Here are -some examples on Linux: - - tar -xvzf gtest-X.Y.Z.tar.gz - tar -xvjf gtest-X.Y.Z.tar.bz2 - unzip gtest-X.Y.Z.zip - -### SVN Checkout ### - -To check out the main branch (also known as the "trunk") of Google -Test, run the following Subversion command: - - svn checkout http://googletest.googlecode.com/svn/trunk/ gtest-svn - -Setting up the Build --------------------- - -To build Google Test and your tests that use it, you need to tell your -build system where to find its headers and source files. The exact -way to do it depends on which build system you use, and is usually -straightforward. - -### Generic Build Instructions ### - -Suppose you put Google Test in directory ${GTEST_DIR}. To build it, -create a library build target (or a project as called by Visual Studio -and Xcode) to compile - - ${GTEST_DIR}/src/gtest-all.cc - -with ${GTEST_DIR}/include in the system header search path and ${GTEST_DIR} -in the normal header search path. Assuming a Linux-like system and gcc, -something like the following will do: - - g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ - -pthread -c ${GTEST_DIR}/src/gtest-all.cc - ar -rv libgtest.a gtest-all.o - -(We need -pthread as Google Test uses threads.) - -Next, you should compile your test source file with -${GTEST_DIR}/include in the system header search path, and link it -with gtest and any other necessary libraries: - - g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ - -o your_test - -As an example, the make/ directory contains a Makefile that you can -use to build Google Test on systems where GNU make is available -(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google -Test's own tests. Instead, it just builds the Google Test library and -a sample test. You can use it as a starting point for your own build -script. - -If the default settings are correct for your environment, the -following commands should succeed: - - cd ${GTEST_DIR}/make - make - ./sample1_unittest - -If you see errors, try to tweak the contents of make/Makefile to make -them go away. There are instructions in make/Makefile on how to do -it. - -### Using CMake ### - -Google Test comes with a CMake build script (CMakeLists.txt) that can -be used on a wide range of platforms ("C" stands for cross-platofrm.). -If you don't have CMake installed already, you can download it for -free from http://www.cmake.org/. - -CMake works by generating native makefiles or build projects that can -be used in the compiler environment of your choice. The typical -workflow starts with: - - mkdir mybuild # Create a directory to hold the build output. - cd mybuild - cmake ${GTEST_DIR} # Generate native build scripts. - -If you want to build Google Test's samples, you should replace the -last command with - - cmake -Dgtest_build_samples=ON ${GTEST_DIR} - -If you are on a *nix system, you should now see a Makefile in the -current directory. Just type 'make' to build gtest. - -If you use Windows and have Vistual Studio installed, a gtest.sln file -and several .vcproj files will be created. 
You can then build them -using Visual Studio. - -On Mac OS X with Xcode installed, a .xcodeproj file will be generated. - -### Legacy Build Scripts ### - -Before settling on CMake, we have been providing hand-maintained build -projects/scripts for Visual Studio, Xcode, and Autotools. While we -continue to provide them for convenience, they are not actively -maintained any more. We highly recommend that you follow the -instructions in the previous two sections to integrate Google Test -with your existing build system. - -If you still need to use the legacy build scripts, here's how: - -The msvc\ folder contains two solutions with Visual C++ projects. -Open the gtest.sln or gtest-md.sln file using Visual Studio, and you -are ready to build Google Test the same way you build any Visual -Studio project. Files that have names ending with -md use DLL -versions of Microsoft runtime libraries (the /MD or the /MDd compiler -option). Files without that suffix use static versions of the runtime -libraries (the /MT or the /MTd option). Please note that one must use -the same option to compile both gtest and the test code. If you use -Visual Studio 2005 or above, we recommend the -md version as /MD is -the default for new projects in these versions of Visual Studio. - -On Mac OS X, open the gtest.xcodeproj in the xcode/ folder using -Xcode. Build the "gtest" target. The universal binary framework will -end up in your selected build directory (selected in the Xcode -"Preferences..." -> "Building" pane and defaults to xcode/build). -Alternatively, at the command line, enter: - - xcodebuild - -This will build the "Release" configuration of gtest.framework in your -default build location. See the "xcodebuild" man page for more -information about building different configurations and building in -different locations. - -If you wish to use the Google Test Xcode project with Xcode 4.x and -above, you need to either: - * update the SDK configuration options in xcode/Config/General.xconfig. - Comment options SDKROOT, MACOS_DEPLOYMENT_TARGET, and GCC_VERSION. If - you choose this route you lose the ability to target earlier versions - of MacOS X. - * Install an SDK for an earlier version. This doesn't appear to be - supported by Apple, but has been reported to work - (http://stackoverflow.com/questions/5378518). - -Tweaking Google Test --------------------- - -Google Test can be used in diverse environments. The default -configuration may not work (or may not work well) out of the box in -some environments. However, you can easily tweak Google Test by -defining control macros on the compiler command line. Generally, -these macros are named like GTEST_XYZ and you define them to either 1 -or 0 to enable or disable a certain feature. - -We list the most frequently used macros below. For a complete list, -see file include/gtest/internal/gtest-port.h. - -### Choosing a TR1 Tuple Library ### - -Some Google Test features require the C++ Technical Report 1 (TR1) -tuple library, which is not yet available with all compilers. The -good news is that Google Test implements a subset of TR1 tuple that's -enough for its own need, and will automatically use this when the -compiler doesn't provide TR1 tuple. - -Usually you don't need to care about which tuple library Google Test -uses. However, if your project already uses TR1 tuple, you need to -tell Google Test to use the same TR1 tuple library the rest of your -project uses, or the two tuple implementations will clash. 
To do -that, add - - -DGTEST_USE_OWN_TR1_TUPLE=0 - -to the compiler flags while compiling Google Test and your tests. If -you want to force Google Test to use its own tuple library, just add - - -DGTEST_USE_OWN_TR1_TUPLE=1 - -to the compiler flags instead. - -If you don't want Google Test to use tuple at all, add - - -DGTEST_HAS_TR1_TUPLE=0 - -and all features using tuple will be disabled. - -### Multi-threaded Tests ### - -Google Test is thread-safe where the pthread library is available. -After #include "gtest/gtest.h", you can check the GTEST_IS_THREADSAFE -macro to see whether this is the case (yes if the macro is #defined to -1, no if it's undefined.). - -If Google Test doesn't correctly detect whether pthread is available -in your environment, you can force it with - - -DGTEST_HAS_PTHREAD=1 - -or - - -DGTEST_HAS_PTHREAD=0 - -When Google Test uses pthread, you may need to add flags to your -compiler and/or linker to select the pthread library, or you'll get -link errors. If you use the CMake script or the deprecated Autotools -script, this is taken care of for you. If you use your own build -script, you'll need to read your compiler and linker's manual to -figure out what flags to add. - -### As a Shared Library (DLL) ### - -Google Test is compact, so most users can build and link it as a -static library for the simplicity. You can choose to use Google Test -as a shared library (known as a DLL on Windows) if you prefer. - -To compile *gtest* as a shared library, add - - -DGTEST_CREATE_SHARED_LIBRARY=1 - -to the compiler flags. You'll also need to tell the linker to produce -a shared library instead - consult your linker's manual for how to do -it. - -To compile your *tests* that use the gtest shared library, add - - -DGTEST_LINKED_AS_SHARED_LIBRARY=1 - -to the compiler flags. - -Note: while the above steps aren't technically necessary today when -using some compilers (e.g. GCC), they may become necessary in the -future, if we decide to improve the speed of loading the library (see -http://gcc.gnu.org/wiki/Visibility for details). Therefore you are -recommended to always add the above flags when using Google Test as a -shared library. Otherwise a future release of Google Test may break -your build script. - -### Avoiding Macro Name Clashes ### - -In C++, macros don't obey namespaces. Therefore two libraries that -both define a macro of the same name will clash if you #include both -definitions. In case a Google Test macro clashes with another -library, you can force Google Test to rename its macro to avoid the -conflict. - -Specifically, if both Google Test and some other code define macro -FOO, you can add - - -DGTEST_DONT_DEFINE_FOO=1 - -to the compiler flags to tell Google Test to change the macro's name -from FOO to GTEST_FOO. Currently FOO can be FAIL, SUCCEED, or TEST. -For example, with -DGTEST_DONT_DEFINE_TEST=1, you'll need to write - - GTEST_TEST(SomeTest, DoesThis) { ... } - -instead of - - TEST(SomeTest, DoesThis) { ... } - -in order to define a test. - -Upgrating from an Earlier Version ---------------------------------- - -We strive to keep Google Test releases backward compatible. -Sometimes, though, we have to make some breaking changes for the -users' long-term benefits. This section describes what you'll need to -do if you are upgrading from an earlier version of Google Test. - -### Upgrading from 1.3.0 or Earlier ### - -You may need to explicitly enable or disable Google Test's own TR1 -tuple library. See the instructions in section "Choosing a TR1 Tuple -Library". 
- -### Upgrading from 1.4.0 or Earlier ### - -The Autotools build script (configure + make) is no longer officially -supportted. You are encouraged to migrate to your own build system or -use CMake. If you still need to use Autotools, you can find -instructions in the README file from Google Test 1.4.0. - -On platforms where the pthread library is available, Google Test uses -it in order to be thread-safe. See the "Multi-threaded Tests" section -for what this means to your build script. - -If you use Microsoft Visual C++ 7.1 with exceptions disabled, Google -Test will no longer compile. This should affect very few people, as a -large portion of STL (including ) doesn't compile in this mode -anyway. We decided to stop supporting it in order to greatly simplify -Google Test's implementation. - -Developing Google Test ----------------------- - -This section discusses how to make your own changes to Google Test. - -### Testing Google Test Itself ### - -To make sure your changes work as intended and don't break existing -functionality, you'll want to compile and run Google Test's own tests. -For that you can use CMake: - - mkdir mybuild - cd mybuild - cmake -Dgtest_build_tests=ON ${GTEST_DIR} - -Make sure you have Python installed, as some of Google Test's tests -are written in Python. If the cmake command complains about not being -able to find Python ("Could NOT find PythonInterp (missing: -PYTHON_EXECUTABLE)"), try telling it explicitly where your Python -executable can be found: - - cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} - -Next, you can build Google Test and all of its own tests. On *nix, -this is usually done by 'make'. To run the tests, do - - make test - -All tests should pass. - -### Regenerating Source Files ### - -Some of Google Test's source files are generated from templates (not -in the C++ sense) using a script. A template file is named FOO.pump, -where FOO is the name of the file it will generate. For example, the -file include/gtest/internal/gtest-type-util.h.pump is used to generate -gtest-type-util.h in the same directory. - -Normally you don't need to worry about regenerating the source files, -unless you need to modify them. In that case, you should modify the -corresponding .pump files instead and run the pump.py Python script to -regenerate them. You can find pump.py in the scripts/ directory. -Read the Pump manual [2] for how to use it. - - [2] http://code.google.com/p/googletest/wiki/PumpManual - -### Contributing a Patch ### - -We welcome patches. Please read the Google Test developer's guide [3] -for how you can contribute. In particular, make sure you have signed -the Contributor License Agreement, or we won't be able to accept the -patch. - - [3] http://code.google.com/p/googletest/wiki/GoogleTestDevGuide - -Happy testing! diff --git a/libs/libvpx/third_party/googletest/src/README.md b/libs/libvpx/third_party/googletest/src/README.md new file mode 100644 index 0000000000..edd4408054 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/README.md @@ -0,0 +1,280 @@ + +### Generic Build Instructions ### + +#### Setup #### + +To build Google Test and your tests that use it, you need to tell your +build system where to find its headers and source files. The exact +way to do it depends on which build system you use, and is usually +straightforward. + +#### Build #### + +Suppose you put Google Test in directory `${GTEST_DIR}`. 
To build it, +create a library build target (or a project as called by Visual Studio +and Xcode) to compile + + ${GTEST_DIR}/src/gtest-all.cc + +with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}` +in the normal header search path. Assuming a Linux-like system and gcc, +something like the following will do: + + g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ + -pthread -c ${GTEST_DIR}/src/gtest-all.cc + ar -rv libgtest.a gtest-all.o + +(We need `-pthread` as Google Test uses threads.) + +Next, you should compile your test source file with +`${GTEST_DIR}/include` in the system header search path, and link it +with gtest and any other necessary libraries: + + g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ + -o your_test + +As an example, the make/ directory contains a Makefile that you can +use to build Google Test on systems where GNU make is available +(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google +Test's own tests. Instead, it just builds the Google Test library and +a sample test. You can use it as a starting point for your own build +script. + +If the default settings are correct for your environment, the +following commands should succeed: + + cd ${GTEST_DIR}/make + make + ./sample1_unittest + +If you see errors, try to tweak the contents of `make/Makefile` to make +them go away. There are instructions in `make/Makefile` on how to do +it. + +### Using CMake ### + +Google Test comes with a CMake build script ( +[CMakeLists.txt](CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for +cross-platform.). If you don't have CMake installed already, you can +download it for free from <http://www.cmake.org/>. + +CMake works by generating native makefiles or build projects that can +be used in the compiler environment of your choice. The typical +workflow starts with: + + mkdir mybuild # Create a directory to hold the build output. + cd mybuild + cmake ${GTEST_DIR} # Generate native build scripts. + +If you want to build Google Test's samples, you should replace the +last command with + + cmake -Dgtest_build_samples=ON ${GTEST_DIR} + +If you are on a \*nix system, you should now see a Makefile in the +current directory. Just type 'make' to build gtest. + +If you use Windows and have Visual Studio installed, a `gtest.sln` file +and several `.vcproj` files will be created. You can then build them +using Visual Studio. + +On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. + +### Legacy Build Scripts ### + +Before settling on CMake, we have been providing hand-maintained build +projects/scripts for Visual Studio, Xcode, and Autotools. While we +continue to provide them for convenience, they are not actively +maintained any more. We highly recommend that you follow the +instructions in the previous two sections to integrate Google Test +with your existing build system. + +If you still need to use the legacy build scripts, here's how: + +The msvc\ folder contains two solutions with Visual C++ projects. +Open the `gtest.sln` or `gtest-md.sln` file using Visual Studio, and you +are ready to build Google Test the same way you build any Visual +Studio project. Files that have names ending with -md use DLL +versions of Microsoft runtime libraries (the /MD or the /MDd compiler +option). Files without that suffix use static versions of the runtime +libraries (the /MT or the /MTd option). Please note that one must use +the same option to compile both gtest and the test code.
If you use +Visual Studio 2005 or above, we recommend the -md version as /MD is +the default for new projects in these versions of Visual Studio. + +On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using +Xcode. Build the "gtest" target. The universal binary framework will +end up in your selected build directory (selected in the Xcode +"Preferences..." -> "Building" pane and defaults to xcode/build). +Alternatively, at the command line, enter: + + xcodebuild + +This will build the "Release" configuration of gtest.framework in your +default build location. See the "xcodebuild" man page for more +information about building different configurations and building in +different locations. + +If you wish to use the Google Test Xcode project with Xcode 4.x and +above, you need to either: + + * update the SDK configuration options in xcode/Config/General.xconfig. + Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If + you choose this route you lose the ability to target earlier versions + of MacOS X. + * Install an SDK for an earlier version. This doesn't appear to be + supported by Apple, but has been reported to work + (http://stackoverflow.com/questions/5378518). + +### Tweaking Google Test ### + +Google Test can be used in diverse environments. The default +configuration may not work (or may not work well) out of the box in +some environments. However, you can easily tweak Google Test by +defining control macros on the compiler command line. Generally, +these macros are named like `GTEST_XYZ` and you define them to either 1 +or 0 to enable or disable a certain feature. + +We list the most frequently used macros below. For a complete list, +see file [include/gtest/internal/gtest-port.h](include/gtest/internal/gtest-port.h). + +### Choosing a TR1 Tuple Library ### + +Some Google Test features require the C++ Technical Report 1 (TR1) +tuple library, which is not yet available with all compilers. The +good news is that Google Test implements a subset of TR1 tuple that's +enough for its own need, and will automatically use this when the +compiler doesn't provide TR1 tuple. + +Usually you don't need to care about which tuple library Google Test +uses. However, if your project already uses TR1 tuple, you need to +tell Google Test to use the same TR1 tuple library the rest of your +project uses, or the two tuple implementations will clash. To do +that, add + + -DGTEST_USE_OWN_TR1_TUPLE=0 + +to the compiler flags while compiling Google Test and your tests. If +you want to force Google Test to use its own tuple library, just add + + -DGTEST_USE_OWN_TR1_TUPLE=1 + +to the compiler flags instead. + +If you don't want Google Test to use tuple at all, add + + -DGTEST_HAS_TR1_TUPLE=0 + +and all features using tuple will be disabled. + +### Multi-threaded Tests ### + +Google Test is thread-safe where the pthread library is available. +After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` +macro to see whether this is the case (yes if the macro is `#defined` to +1, no if it's undefined.). + +If Google Test doesn't correctly detect whether pthread is available +in your environment, you can force it with + + -DGTEST_HAS_PTHREAD=1 + +or + + -DGTEST_HAS_PTHREAD=0 + +When Google Test uses pthread, you may need to add flags to your +compiler and/or linker to select the pthread library, or you'll get +link errors. If you use the CMake script or the deprecated Autotools +script, this is taken care of for you. 
If you use your own build +script, you'll need to read your compiler and linker's manual to +figure out what flags to add. + +### As a Shared Library (DLL) ### + +Google Test is compact, so most users can build and link it as a +static library for the simplicity. You can choose to use Google Test +as a shared library (known as a DLL on Windows) if you prefer. + +To compile *gtest* as a shared library, add + + -DGTEST_CREATE_SHARED_LIBRARY=1 + +to the compiler flags. You'll also need to tell the linker to produce +a shared library instead - consult your linker's manual for how to do +it. + +To compile your *tests* that use the gtest shared library, add + + -DGTEST_LINKED_AS_SHARED_LIBRARY=1 + +to the compiler flags. + +Note: while the above steps aren't technically necessary today when +using some compilers (e.g. GCC), they may become necessary in the +future, if we decide to improve the speed of loading the library (see + for details). Therefore you are +recommended to always add the above flags when using Google Test as a +shared library. Otherwise a future release of Google Test may break +your build script. + +### Avoiding Macro Name Clashes ### + +In C++, macros don't obey namespaces. Therefore two libraries that +both define a macro of the same name will clash if you `#include` both +definitions. In case a Google Test macro clashes with another +library, you can force Google Test to rename its macro to avoid the +conflict. + +Specifically, if both Google Test and some other code define macro +FOO, you can add + + -DGTEST_DONT_DEFINE_FOO=1 + +to the compiler flags to tell Google Test to change the macro's name +from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, +or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll +need to write + + GTEST_TEST(SomeTest, DoesThis) { ... } + +instead of + + TEST(SomeTest, DoesThis) { ... } + +in order to define a test. + +## Developing Google Test ## + +This section discusses how to make your own changes to Google Test. + +### Testing Google Test Itself ### + +To make sure your changes work as intended and don't break existing +functionality, you'll want to compile and run Google Test's own tests. +For that you can use CMake: + + mkdir mybuild + cd mybuild + cmake -Dgtest_build_tests=ON ${GTEST_DIR} + +Make sure you have Python installed, as some of Google Test's tests +are written in Python. If the cmake command complains about not being +able to find Python (`Could NOT find PythonInterp (missing: +PYTHON_EXECUTABLE)`), try telling it explicitly where your Python +executable can be found: + + cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} + +Next, you can build Google Test and all of its own tests. On \*nix, +this is usually done by 'make'. To run the tests, do + + make test + +All tests should pass. + +Normally you don't need to worry about regenerating the source files, +unless you need to modify them. In that case, you should modify the +corresponding .pump files instead and run the pump.py Python script to +regenerate them. You can find pump.py in the [scripts/](scripts/) directory. +Read the [Pump manual](docs/PumpManual.md) for how to use it. diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h new file mode 100644 index 0000000000..957a69c6a9 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -0,0 +1,294 @@ +// Copyright 2005, Google Inc. 
+// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +#include "gtest/internal/gtest-death-test-internal.h" + +namespace testing { + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. 
+// +// Examples: +// +// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); +// for (int i = 0; i < 5; i++) { +// EXPECT_DEATH(server.ProcessRequest(i), +// "Invalid request .* in ProcessRequest()") +// << "Failed to die on request " << i; +// } +// +// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); +// +// bool KilledBySIGHUP(int exit_code) { +// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; +// } +// +// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); +// +// On the regular expressions used in death tests: +// +// On POSIX-compliant systems (*nix), we use the library, +// which uses the POSIX extended regex syntax. +// +// On other platforms (e.g. Windows), we only support a simple regex +// syntax implemented as part of Google Test. This limited +// implementation should be enough most of the time when writing +// death tests; though it lacks many features you can find in PCRE +// or POSIX extended regex syntax. For example, we don't support +// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and +// repetition count ("x{5,7}"), among others. +// +// Below is the syntax that we do support. We chose it to be a +// subset of both PCRE and POSIX extended regex, so it's easy to +// learn wherever you come from. In the following: 'A' denotes a +// literal character, period (.), or a single \\ escape sequence; +// 'x' and 'y' denote regular expressions; 'm' and 'n' are for +// natural numbers. +// +// c matches any literal character c +// \\d matches any decimal digit +// \\D matches any character that's not a decimal digit +// \\f matches \f +// \\n matches \n +// \\r matches \r +// \\s matches any ASCII whitespace, including \n +// \\S matches any character that's not a whitespace +// \\t matches \t +// \\v matches \v +// \\w matches any letter, _, or decimal digit +// \\W matches any character that \\w doesn't match +// \\c matches any literal character c, which must be a punctuation +// . matches any single character except \n +// A? matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. +// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// +// TODO(wan@google.com): make thread-safe death tests search the PATH. 
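A minimal, self-contained sketch of a death test whose regular expression stays within the portable subset documented above; the CrashWithBadPort() helper is hypothetical, and the example assumes death tests are supported on the target platform and that the binary links against gtest and gtest_main.

    // A complete death test using only the portable regex subset.
    // CrashWithBadPort() is a hypothetical helper used for illustration.
    #include <cstdio>
    #include <cstdlib>
    #include "gtest/gtest.h"

    static void CrashWithBadPort(int port) {
      std::fprintf(stderr, "invalid port %d\n", port);
      std::abort();  // Dies by signal, so EXPECT_DEATH sees a death.
    }

    TEST(PortDeathTest, RejectsNegativePort) {
      // "\\d+" is in the supported subset; alternation ("x|y") and
      // grouping ("(xy)") are not, so they are avoided here.
      EXPECT_DEATH(CrashWithBadPort(-1), "invalid port -\\d+");
    }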
+ +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +# define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test case, if any: +# define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +# define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test case, if any: +# define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode& other); + + const int exit_code_; +}; + +# if !GTEST_OS_WINDOWS +// Tests that an exit code describes an exit due to termination by a +// given signal. +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + private: + const int signum_; +}; +# endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. 
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +# ifdef NDEBUG + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# else + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + EXPECT_DEATH(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + ASSERT_DEATH(statement, regex) + +# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) +#endif + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h new file mode 100644 index 0000000000..fe879bca79 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -0,0 +1,250 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. 
+// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include + +#include "gtest/internal/gtest-port.h" + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret&, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + +#if GTEST_OS_SYMBIAN + // Streams a value (either a pointer or not) to this object. + template + inline Message& operator <<(const T& value) { + StreamHelper(typename internal::is_pointer::type(), value); + return *this; + } +#else + // Streams a non-pointer value to this object. + template + inline Message& operator <<(const T& val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator <<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. 
+ // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message& operator <<(T* const& pointer) { // NOLINT + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } +#endif // GTEST_OS_SYMBIAN + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message& operator <<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator <<(bool b) { + return *this << (b ? "true" : "false"); + } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. + Message& operator <<(const wchar_t* wide_c_str); + Message& operator <<(wchar_t* wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::wstring& wstr); +#endif // GTEST_HAS_GLOBAL_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + +#if GTEST_OS_SYMBIAN + // These are needed as the Nokia Symbian Compiler cannot decide between + // const T& and const T* in a function template. The Nokia compiler _can_ + // decide between class template specializations for T and T*, so a + // tr1::type_traits-like is_pointer works, and we can overload on that. + template + inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + } + template + inline void StreamHelper(internal::false_type /*is_pointer*/, + const T& value) { + // See the comments in Message& operator <<(const T&) above for why + // we need this using statement. + using ::operator <<; + *ss_ << value; + } +#endif // GTEST_OS_SYMBIAN + + // We'll hold the text streamed to this object here. + const internal::scoped_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator <<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". 
+template +std::string StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h new file mode 100644 index 0000000000..038f9ba79e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -0,0 +1,1444 @@ +// This file was GENERATED by command: +// pump.py gtest-param-test.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: vladl@google.com (Vlad Losev) +// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. 
+ +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test case +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_CASE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more then once) the first argument to the +// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the +// actual test case name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests +// in the given test case, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_CASE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. 
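Pulling the pieces above together, a minimal sketch of a complete value-parameterized test in one translation unit; the fixture, test, and instantiation names are illustrative only, and linking against gtest_main is assumed.

    // A complete value-parameterized test in a single file.
    #include <string>
    #include "gtest/gtest.h"

    class GreetingTest : public ::testing::TestWithParam<std::string> {};

    TEST_P(GreetingTest, IsNonEmpty) {
      EXPECT_FALSE(GetParam().empty());
    }

    // Generates GreetingNames/GreetingTest.IsNonEmpty/0 .. /2, selectable
    // with --gtest_filter=GreetingNames/*.
    INSTANTIATE_TEST_CASE_P(GreetingNames, GreetingTest,
                            ::testing::Values(std::string("meeny"),
                                              std::string("miny"),
                                              std::string("moe")));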
+// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include "gtest/internal/gtest-port.h" + +#if !GTEST_OS_SYMBIAN +# include +#endif + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-param-util-generated.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test case is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test case FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. 
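As a quick illustration of the behaviour described above, a small sketch (names are illustrative) in which Range(0, 10, 3) yields the values {0, 3, 6, 9}, with the end value 10 excluded.

    // Range(0, 10, 3) generates one test instance each for 0, 3, 6 and 9.
    #include "gtest/gtest.h"

    class StepTest : public ::testing::TestWithParam<int> {};

    TEST_P(StepTest, ParamIsMultipleOfThree) {
      EXPECT_EQ(0, GetParam() % 3);
    }

    // The end value 10 is never generated.
    INSTANTIATE_TEST_CASE_P(ZeroToTenByThree, StepTest,
                            ::testing::Range(0, 10, 3));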
+// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test case StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// +// This instantiates tests from test case StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_CASE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_CASE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename ::testing::internal::IteratorTraits + ::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test case BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// +// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// Currently, Values() supports from 1 to 50 parameters. 
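The ValuesIn() and Values() documentation above translates into code along these lines; the sketch assumes <string> and <vector> are available, and the fixture name WordListTest is hypothetical:

#include <string>
#include <vector>

static std::vector<std::string> GetTestWords() {
  std::vector<std::string> words;
  words.push_back("foo");
  words.push_back("bar");
  words.push_back("baz");
  return words;
}

class WordListTest : public ::testing::TestWithParam<std::string> {};

TEST_P(WordListTest, IsThreeCharsLong) {
  EXPECT_EQ(3u, GetParam().size());
}

// ValuesIn() copies the elements, so passing a temporary container is safe;
// Values("foo", "bar", "baz") would produce the same parameter sequence here.
INSTANTIATE_TEST_CASE_P(WordSequence, WordListTest,
                        ::testing::ValuesIn(GetTestWords()));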
+// +template +internal::ValueArray1 Values(T1 v1) { + return internal::ValueArray1(v1); +} + +template +internal::ValueArray2 Values(T1 v1, T2 v2) { + return internal::ValueArray2(v1, v2); +} + +template +internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { + return internal::ValueArray3(v1, v2, v3); +} + +template +internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { + return internal::ValueArray4(v1, v2, v3, v4); +} + +template +internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5) { + return internal::ValueArray5(v1, v2, v3, v4, v5); +} + +template +internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6) { + return internal::ValueArray6(v1, v2, v3, v4, v5, v6); +} + +template +internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7) { + return internal::ValueArray7(v1, v2, v3, v4, v5, + v6, v7); +} + +template +internal::ValueArray8 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { + return internal::ValueArray8(v1, v2, v3, v4, + v5, v6, v7, v8); +} + +template +internal::ValueArray9 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { + return internal::ValueArray9(v1, v2, v3, + v4, v5, v6, v7, v8, v9); +} + +template +internal::ValueArray10 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { + return internal::ValueArray10(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10); +} + +template +internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) { + return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); +} + +template +internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) { + return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); +} + +template +internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) { + return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); +} + +template +internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { + return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14); +} + +template +internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { + return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); +} + +template +internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16) { + return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16); +} + +template +internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17) { + return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17); +} + +template +internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18) { + return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18); +} + +template +internal::ValueArray19 
Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { + return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); +} + +template +internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { + return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); +} + +template +internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { + return internal::ValueArray21(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); +} + +template +internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22) { + return internal::ValueArray22(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22); +} + +template +internal::ValueArray23 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23) { + return internal::ValueArray23(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23); +} + +template +internal::ValueArray24 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24) { + return internal::ValueArray24(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24); +} + +template +internal::ValueArray25 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { + return internal::ValueArray25(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25); +} + +template +internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) { + return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); +} + +template +internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) { + return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); +} + +template +internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) { + return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28); +} + +template +internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) { + return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29); +} + +template +internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { + return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30); +} + +template +internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { + return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31); +} + +template +internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32) { + return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32); +} + +template +internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33) { + return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); +} + +template +internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34) { + return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); +} + +template +internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { + return internal::ValueArray35(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); +} + +template +internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { + return internal::ValueArray36(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36); +} + +template +internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37) { + return internal::ValueArray37(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37); +} + +template +internal::ValueArray38 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38) { + return internal::ValueArray38(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, + v33, v34, v35, v36, v37, v38); +} + +template +internal::ValueArray39 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38, T39 v39) { + return internal::ValueArray39(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39); +} + +template +internal::ValueArray40 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, + T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, + T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { + return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); +} + +template +internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + 
T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { + return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); +} + +template +internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) { + return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, + v42); +} + +template +internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) { + return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, + v41, v42, v43); +} + +template +internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) { + return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, + v40, v41, v42, v43, v44); +} + +template +internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { + return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, + v39, v40, v41, v42, v43, v44, v45); +} + +template +internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 
v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { + return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46); +} + +template +internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { + return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); +} + +template +internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, + T48 v48) { + return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, + v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); +} + +template +internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, + T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, + T47 v47, T48 v48, T49 v49) { + return internal::ValueArray49(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, + v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); +} + +template +internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, + T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, + T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { + return internal::ValueArray50(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. 
+// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test case FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { + return Values(false, true); +} + +# if GTEST_HAS_COMBINE +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to 10 arguments. This number is currently limited +// by the maximum number of elements in the tuple implementation used by Google +// Test. +// +// Example: +// +// This will instantiate tests in test case AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. 
+// } +// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder2 Combine( + const Generator1& g1, const Generator2& g2) { + return internal::CartesianProductHolder2( + g1, g2); +} + +template +internal::CartesianProductHolder3 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3) { + return internal::CartesianProductHolder3( + g1, g2, g3); +} + +template +internal::CartesianProductHolder4 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4) { + return internal::CartesianProductHolder4( + g1, g2, g3, g4); +} + +template +internal::CartesianProductHolder5 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5) { + return internal::CartesianProductHolder5( + g1, g2, g3, g4, g5); +} + +template +internal::CartesianProductHolder6 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6) { + return internal::CartesianProductHolder6( + g1, g2, g3, g4, g5, g6); +} + +template +internal::CartesianProductHolder7 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7) { + return internal::CartesianProductHolder7( + g1, g2, g3, g4, g5, g6, g7); +} + +template +internal::CartesianProductHolder8 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8) { + return internal::CartesianProductHolder8( + g1, g2, g3, g4, g5, g6, g7, g8); +} + +template +internal::CartesianProductHolder9 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9) { + return internal::CartesianProductHolder9( + g1, g2, g3, g4, g5, g6, g7, g8, g9); +} + +template +internal::CartesianProductHolder10 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9, + const Generator10& g10) { + return internal::CartesianProductHolder10( + g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); +} +# endif // GTEST_HAS_COMBINE + + + +# define TEST_P(test_case_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + : public test_case_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ + virtual void TestBody(); \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestPattern(\ + #test_case_name, \ + #test_name, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(\ + test_case_name, test_name)>()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_case_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user +// to specify a function or functor that generates custom test name suffixes +// based on the test parameters. The function should accept one argument of +// type testing::TestParamInfo, and return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). It does not work +// for std::string or C strings. +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. + +# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ + ::testing::internal::ParamGenerator \ + gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ + ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + return ::testing::internal::GetParamNameGen \ + (__VA_ARGS__)(info); \ + } \ + int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestCaseInstantiation(\ + #prefix, \ + >est_##prefix##test_case_name##_EvalGenerator_, \ + >est_##prefix##test_case_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h.pump new file mode 100644 index 0000000000..3078d6d2a1 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h.pump @@ -0,0 +1,510 @@ +$$ -*- mode: c++; -*- +$var n = 50 $$ Maximum length of Values arguments we want to support. +$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: vladl@google.com (Vlad Losev) +// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test case +// each with parameter values "meeny", "miny", and "moe". 
+ +INSTANTIATE_TEST_CASE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more then once) the first argument to the +// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the +// actual test case name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests +// in the given test case, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_CASE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include "gtest/internal/gtest-port.h" + +#if !GTEST_OS_SYMBIAN +# include +#endif + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. 
+#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-param-util-generated.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test case is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test case FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). 
+// +// Examples: +// +// This instantiates tests from test case StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// +// This instantiates tests from test case StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_CASE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_CASE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename ::testing::internal::IteratorTraits + ::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test case BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// +// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// Currently, Values() supports from 1 to $n parameters. +// +$range i 1..n +$for i [[ +$range j 1..i + +template <$for j, [[typename T$j]]> +internal::ValueArray$i<$for j, [[T$j]]> Values($for j, [[T$j v$j]]) { + return internal::ValueArray$i<$for j, [[T$j]]>($for j, [[v$j]]); +} + +]] + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test case FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { + return Values(false, true); +} + +# if GTEST_HAS_COMBINE +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. 
+// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to $maxtuple arguments. This number is currently limited +// by the maximum number of elements in the tuple implementation used by Google +// Test. +// +// Example: +// +// This will instantiate tests in test case AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +$range i 2..maxtuple +$for i [[ +$range j 1..i + +template <$for j, [[typename Generator$j]]> +internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( + $for j, [[const Generator$j& g$j]]) { + return internal::CartesianProductHolder$i<$for j, [[Generator$j]]>( + $for j, [[g$j]]); +} + +]] +# endif // GTEST_HAS_COMBINE + + + +# define TEST_P(test_case_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + : public test_case_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ + virtual void TestBody(); \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestPattern(\ + #test_case_name, \ + #test_name, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(\ + test_case_name, test_name)>()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_case_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user +// to specify a function or functor that generates custom test name suffixes +// based on the test parameters. The function should accept one argument of +// type testing::TestParamInfo, and return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. 
Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. + +# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ + ::testing::internal::ParamGenerator \ + gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ + ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + return ::testing::internal::GetParamNameGen \ + (__VA_ARGS__)(info); \ + } \ + int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestCaseInstantiation(\ + #prefix, \ + >est_##prefix##test_case_name##_EvalGenerator_, \ + >est_##prefix##test_case_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h new file mode 100644 index 0000000000..8a33164cb3 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -0,0 +1,993 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. 
operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. +// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include // NOLINT +#include +#include +#include +#include +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-internal.h" + +#if GTEST_HAS_STD_TUPLE_ +# include +#endif + +namespace testing { + +// Definitions in the 'internal' and 'internal2' name spaces are +// subject to change without notice. DO NOT USE THEM IN USER CODE! +namespace internal2 { + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, + ::std::ostream* os); + +// For selecting which printer to use when a given type has neither << +// nor PrintTo(). +enum TypeKind { + kProtobuf, // a protobuf type + kConvertibleToInteger, // a type implicitly convertible to BiggestInt + // (e.g. a named or unnamed enum type) + kOtherType // anything else +}; + +// TypeWithoutFormatter::PrintValue(value, os) is called +// by the universal printer to print a value of type T when neither +// operator<< nor PrintTo() is defined for T, where kTypeKind is the +// "kind" of T as defined by enum TypeKind. +template +class TypeWithoutFormatter { + public: + // This default version is called when kTypeKind is kOtherType. 
+ static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo(reinterpret_cast(&value), + sizeof(value), os); + } +}; + +// We print a protobuf using its ShortDebugString() when the string +// doesn't exceed this many characters; otherwise we print it using +// DebugString() for better readability. +const size_t kProtobufOneLinerMaxLength = 50; + +template +class TypeWithoutFormatter { + public: + static void PrintValue(const T& value, ::std::ostream* os) { + const ::testing::internal::string short_str = value.ShortDebugString(); + const ::testing::internal::string pretty_str = + short_str.length() <= kProtobufOneLinerMaxLength ? + short_str : ("\n" + value.DebugString()); + *os << ("<" + pretty_str + ">"); + } +}; + +template +class TypeWithoutFormatter { + public: + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(const T& value, ::std::ostream* os) { + const internal::BiggestInt kBigInt = value; + *os << kBigInt; + } +}; + +// Prints the given value to the given ostream. If the value is a +// protocol message, its debug string is printed; if it's an enum or +// of a type implicitly convertible to BiggestInt, it's printed as an +// integer; otherwise the bytes in the value are printed. This is +// what UniversalPrinter::Print() does when it knows nothing about +// type T and T has neither << operator nor PrintTo(). +// +// A user can override this behavior for a class type Foo by defining +// a << operator in the namespace where Foo is defined. +// +// We put this operator in namespace 'internal2' instead of 'internal' +// to simplify the implementation, as much code in 'internal' needs to +// use << in STL, which would conflict with our own << were it defined +// in 'internal'. +// +// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If +// we define it to take an std::ostream instead, we'll get an +// "ambiguous overloads" compiler error when trying to print a type +// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether +// operator<<(std::ostream&, const T&) or +// operator<<(std::basic_stream, const Foo&) is more +// specific. +template +::std::basic_ostream& operator<<( + ::std::basic_ostream& os, const T& x) { + TypeWithoutFormatter::value ? kProtobuf : + internal::ImplicitlyConvertible::value ? + kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); + return os; +} + +} // namespace internal2 +} // namespace testing + +// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up +// magic needed for implementing UniversalPrinter won't work. +namespace testing_internal { + +// Used to print a value that is not an STL-style container when the +// user doesn't define PrintTo() for it. +template +void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { + // With the following statement, during unqualified name lookup, + // testing::internal2::operator<< appears as if it was declared in + // the nearest enclosing namespace that contains both + // ::testing_internal and ::testing::internal2, i.e. the global + // namespace. 
For more details, refer to the C++ Standard section + // 7.3.4-1 [namespace.udir]. This allows us to fall back onto + // testing::internal2::operator<< in case T doesn't come with a << + // operator. + // + // We cannot write 'using ::testing::internal2::operator<<;', which + // gcc 3.3 fails to compile due to a compiler bug. + using namespace ::testing::internal2; // NOLINT + + // Assuming T is defined in namespace foo, in the next statement, + // the compiler will consider all of: + // + // 1. foo::operator<< (thanks to Koenig look-up), + // 2. ::operator<< (as the current namespace is enclosed in ::), + // 3. testing::internal2::operator<< (thanks to the using statement above). + // + // The operator<< whose type matches T best will be picked. + // + // We deliberately allow #2 to be a candidate, as sometimes it's + // impossible to define #1 (e.g. when foo is ::std, defining + // anything in it is undefined behavior unless you are a compiler + // vendor.). + *os << value; +} + +} // namespace testing_internal + +namespace testing { +namespace internal { + +// FormatForComparison::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint& value) { + return ::testing::PrintToString(value); + } +}; + +// Array. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint* value) { + return FormatForComparison::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(static_cast(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. 
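As a sanity check on the selection rules described at the top of this header (a PrintTo() in the value's own namespace is preferred, then an operator<<, then the fallback printers), here is a minimal sketch; the namespace foo, the type MyPoint, and the test name are illustrative and not part of Google Test.

#include <ostream>
#include <string>
#include "gtest/gtest.h"

namespace foo {
struct MyPoint { int x; int y; };

// Rule 1: a PrintTo() next to the type is found via argument-dependent lookup.
void PrintTo(const MyPoint& p, ::std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}
}  // namespace foo

TEST(PrinterSketch, PicksUpPrintTo) {
  const foo::MyPoint p = {1, 2};
  // PrintToString() routes through the same machinery, so the custom
  // formatter above is used instead of the raw-bytes fallback.
  EXPECT_EQ("(1, 2)", ::testing::PrintToString(p));
}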
+ +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); + +#if GTEST_HAS_GLOBAL_STRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); +#endif + +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); +#endif + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +std::string FormatForComparisonFailureMessage( + const T1& value, const T2& /* other_operand */) { + return FormatForComparison::Format(value); +} + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +template +void UniversalPrint(const T& value, ::std::ostream* os); + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +template +void DefaultPrintTo(IsContainer /* dummy */, + false_type /* is not a pointer */, + const C& container, ::std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (typename C::const_iterator it = container.begin(); + it != container.end(); ++it, ++count) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(*it, os) here as PrintTo() doesn't + // handle *it being a native array. + internal::UniversalPrint(*it, os); + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; +} + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +template +void DefaultPrintTo(IsNotContainer /* dummy */, + true_type /* is a pointer */, + T* p, ::std::ostream* os) { + if (p == NULL) { + *os << "NULL"; + } else { + // C++ doesn't allow casting from a function pointer to any object + // pointer. + // + // IsTrue() silences warnings: "Condition is always true", + // "unreachable code". 
+ if (IsTrue(ImplicitlyConvertible::value)) { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. However, we cannot cast it to const void* directly, + // even using reinterpret_cast, as earlier versions of gcc + // (e.g. 3.4.5) cannot compile the cast when p is a function + // pointer. Casting to UInt64 first solves the problem. + *os << reinterpret_cast( + reinterpret_cast(p)); + } + } +} + +// Used to print a non-container, non-pointer value when the user +// doesn't define PrintTo() for it. +template +void DefaultPrintTo(IsNotContainer /* dummy */, + false_type /* is not a pointer */, + const T& value, ::std::ostream* os) { + ::testing_internal::DefaultPrintNonContainerTo(value, os); +} + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T& value, ::std::ostream* os) { + // DefaultPrintTo() is overloaded. The type of its first two + // arguments determine which version will be picked. If T is an + // STL-style container, the version for container will be called; if + // T is a pointer, the pointer version will be called; otherwise the + // generic version will be called. + // + // Note that we check for container types here, prior to we check + // for protocol message types in our operator<<. The rationale is: + // + // For protocol messages, we want to give people a chance to + // override Google Mock's format by defining a PrintTo() or + // operator<<. For STL containers, other formats can be + // incompatible with Google Mock's format for the container + // elements; therefore we check for container types here to ensure + // that our format is used. + // + // The second argument of DefaultPrintTo() is needed to bypass a bug + // in Symbian's C++ compiler that prevents it from picking the right + // overload between: + // + // PrintTo(const T& x, ...); + // PrintTo(T* x, ...); + DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); +inline void PrintTo(char c, ::std::ostream* os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. +inline void PrintTo(bool x, ::std::ostream* os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. 
+// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); +inline void PrintTo(char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); +inline void PrintTo(wchar_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::string and ::std::string. +#if GTEST_HAS_GLOBAL_STRING +GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); +inline void PrintTo(const ::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +inline void PrintTo(const ::std::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} + +// Overloads for ::wstring and ::std::wstring. +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T& t, ::std::ostream* os); +#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ + +#if GTEST_HAS_TR1_TUPLE +// Overload for ::std::tr1::tuple. Needed for printing function arguments, +// which are packed as tuples. + +// Overloaded PrintTo() for tuples of various arities. 
We support +// tuples of up-to 10 fields. The following implementation works +// regardless of whether tr1::tuple is implemented using the +// non-standard variadic template feature or not. + +inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo( + const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} +#endif // GTEST_HAS_TR1_TUPLE + +#if GTEST_HAS_STD_TUPLE_ +template +void PrintTo(const ::std::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} +#endif // GTEST_HAS_STD_TUPLE_ + +// Overload for std::pair. +template +void PrintTo(const ::std::pair& value, ::std::ostream* os) { + *os << '('; + // We cannot use UniversalPrint(value.first, os) here, as T1 may be + // a reference type. The same for printing value.second. + UniversalPrinter::Print(value.first, os); + *os << ", "; + UniversalPrinter::Print(value.second, os); + *os << ')'; +} + +// Implements printing a non-reference type T by letting the compiler +// pick the right overload of PrintTo() for T. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + // Note: we deliberately don't call this PrintTo(), as that name + // conflicts with ::testing::internal::PrintTo in the body of the + // function. + static void Print(const T& value, ::std::ostream* os) { + // By default, ::testing::internal::PrintTo() is used for printing + // the value. + // + // Thanks to Koenig look-up, if T is a class and has its own + // PrintTo() function defined in its namespace, that function will + // be visible here. Since it is more specific than the generic ones + // in ::testing::internal, it will be picked by the compiler in the + // following statement - exactly what we want. + PrintTo(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// UniversalPrintArray(begin, len, os) prints an array of 'len' +// elements, starting at address 'begin'. +template +void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { + if (len == 0) { + *os << "{}"; + } else { + *os << "{ "; + const size_t kThreshold = 18; + const size_t kChunkSize = 8; + // If the array has more than kThreshold elements, we'll have to + // omit some details by printing only the first and the last + // kChunkSize elements. + // TODO(wan@google.com): let the user control the threshold using a flag. 
+ if (len <= kThreshold) { + PrintRawArrayTo(begin, len, os); + } else { + PrintRawArrayTo(begin, kChunkSize, os); + *os << ", ..., "; + PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); + } + *os << " }"; + } +} +// This overload prints a (const) char array compactly. +GTEST_API_ void UniversalPrintArray( + const char* begin, size_t len, ::std::ostream* os); + +// This overload prints a (const) wchar_t array compactly. +GTEST_API_ void UniversalPrintArray( + const wchar_t* begin, size_t len, ::std::ostream* os); + +// Implements printing an array type T[N]. +template +class UniversalPrinter { + public: + // Prints the given array, omitting some elements when there are too + // many. + static void Print(const T (&a)[N], ::std::ostream* os) { + UniversalPrintArray(a, N, os); + } +}; + +// Implements printing a reference type T&. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + static void Print(const T& value, ::std::ostream* os) { + // Prints the address of the value. We use reinterpret_cast here + // as static_cast doesn't compile when T is a function type. + *os << "@" << reinterpret_cast(&value) << " "; + + // Then prints the value itself. + UniversalPrint(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// Prints a value tersely: for a reference type, the referenced value +// (but not the address) is printed; for a (const) char pointer, the +// NUL-terminated string (but not the pointer) is printed. + +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T (&value)[N], ::std::ostream* os) { + UniversalPrinter::Print(value, os); + } +}; +template <> +class UniversalTersePrinter { + public: + static void Print(const char* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(string(str), os); + } + } +}; +template <> +class UniversalTersePrinter { + public: + static void Print(char* str, ::std::ostream* os) { + UniversalTersePrinter::Print(str, os); + } +}; + +#if GTEST_HAS_STD_WSTRING +template <> +class UniversalTersePrinter { + public: + static void Print(const wchar_t* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(::std::wstring(str), os); + } + } +}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(wchar_t* str, ::std::ostream* os) { + UniversalTersePrinter::Print(str, os); + } +}; + +template +void UniversalTersePrint(const T& value, ::std::ostream* os) { + UniversalTersePrinter::Print(value, os); +} + +// Prints a value using the type inferred by the compiler. The +// difference between this and UniversalTersePrint() is that for a +// (const) char pointer, this prints both the pointer and the +// NUL-terminated string. +template +void UniversalPrint(const T& value, ::std::ostream* os) { + // A workarond for the bug in VC++ 7.1 that prevents us from instantiating + // UniversalPrinter with T directly. + typedef T T1; + UniversalPrinter::Print(value, os); +} + +typedef ::std::vector Strings; + +// TuplePolicy must provide: +// - tuple_size +// size of tuple TupleT. 
+// - get(const TupleT& t) +// static function extracting element I of tuple TupleT. +// - tuple_element::type +// type of element I of tuple TupleT. +template +struct TuplePolicy; + +#if GTEST_HAS_TR1_TUPLE +template +struct TuplePolicy { + typedef TupleT Tuple; + static const size_t tuple_size = ::std::tr1::tuple_size::value; + + template + struct tuple_element : ::std::tr1::tuple_element {}; + + template + static typename AddReference< + const typename ::std::tr1::tuple_element::type>::type get( + const Tuple& tuple) { + return ::std::tr1::get(tuple); + } +}; +template +const size_t TuplePolicy::tuple_size; +#endif // GTEST_HAS_TR1_TUPLE + +#if GTEST_HAS_STD_TUPLE_ +template +struct TuplePolicy< ::std::tuple > { + typedef ::std::tuple Tuple; + static const size_t tuple_size = ::std::tuple_size::value; + + template + struct tuple_element : ::std::tuple_element {}; + + template + static const typename ::std::tuple_element::type& get( + const Tuple& tuple) { + return ::std::get(tuple); + } +}; +template +const size_t TuplePolicy< ::std::tuple >::tuple_size; +#endif // GTEST_HAS_STD_TUPLE_ + +#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ +// This helper template allows PrintTo() for tuples and +// UniversalTersePrintTupleFieldsToStrings() to be defined by +// induction on the number of tuple fields. The idea is that +// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N +// fields in tuple t, and can be defined in terms of +// TuplePrefixPrinter. +// +// The inductive case. +template +struct TuplePrefixPrinter { + // Prints the first N fields of a tuple. + template + static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { + TuplePrefixPrinter::PrintPrefixTo(t, os); + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (N > 1) { + GTEST_INTENTIONAL_CONST_COND_POP_() + *os << ", "; + } + UniversalPrinter< + typename TuplePolicy::template tuple_element::type> + ::Print(TuplePolicy::template get(t), os); + } + + // Tersely prints the first N fields of a tuple to a string vector, + // one element for each field. + template + static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { + TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); + ::std::stringstream ss; + UniversalTersePrint(TuplePolicy::template get(t), &ss); + strings->push_back(ss.str()); + } +}; + +// Base case. +template <> +struct TuplePrefixPrinter<0> { + template + static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} + + template + static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} +}; + +// Helper function for printing a tuple. +// Tuple must be either std::tr1::tuple or std::tuple type. +template +void PrintTupleTo(const Tuple& t, ::std::ostream* os) { + *os << "("; + TuplePrefixPrinter::tuple_size>::PrintPrefixTo(t, os); + *os << ")"; +} + +// Prints the fields of a tuple tersely to a string vector, one +// element for each field. See the comment before +// UniversalTersePrint() for how we define "tersely". +template +Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { + Strings result; + TuplePrefixPrinter::tuple_size>:: + TersePrintPrefixToStrings(value, &result); + return result; +} +#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ + +} // namespace internal + +template +::std::string PrintToString(const T& value) { + ::std::stringstream ss; + internal::UniversalTersePrinter::Print(value, &ss); + return ss.str(); +} + +} // namespace testing + +// Include any custom printer added by the local installation. 
+// We must include this header at the end to make sure it can use the +// declarations from this file. +#include "gtest/internal/custom/gtest-printers.h" + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h new file mode 100644 index 0000000000..f63fa9a1b2 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -0,0 +1,232 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Utilities for testing Google Test itself and code that uses Google Test +// (e.g. frameworks built on top of Google Test). + +#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include "gtest/gtest.h" + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. 
DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray* result); + + // The d'tor restores the previous test part result reporter. + virtual ~ScopedFakeTestPartResultReporter(); + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + virtual void ReportTestPartResult(const TestPartResult& result); + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface* old_reporter_; + TestPartResultArray* const result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr); + ~SingleFailureChecker(); + private: + const TestPartResultArray* const results_; + const TestPartResult::Type type_; + const string substr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); +}; + +} // namespace internal + +} // namespace testing + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures. It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_FATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - 'statement' cannot reference local non-static variables or +// non-static members of the current object. +// - 'statement' cannot return a value. +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. The AcceptsMacroThatExpandsToUnprotectedComma test in +// gtest_unittest.cc will fail to compile if we do that. 
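As a concrete illustration of the contract spelled out above (exactly one fatal failure, matched by substring, with the statement unable to touch local non-static state), a self-test might look like this minimal sketch; the test name is hypothetical.

#include "gtest/gtest-spi.h"

TEST(SpiSketch, CatchesExactlyOneFatalFailure) {
  // ASSERT_TRUE(false) raises one fatal failure; the second argument only
  // needs to be a substring of that failure's message. Note the statement
  // uses no local variables, per the restrictions listed above.
  EXPECT_FATAL_FAILURE(ASSERT_TRUE(false) << "expected fatal failure",
                       "expected fatal failure");
}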
+#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ALL_THREADS, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +// A macro for testing Google Test assertions or code that's expected to +// generate Google Test non-fatal failures. It asserts that the given +// statement will cause exactly one non-fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// 'statement' is allowed to reference local variables and members of +// the current object. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. If we do that, the code won't compile when the user gives +// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that +// expands to code containing an unprotected comma. The +// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc +// catches that. +// +// For the same reason, we have to write +// if (::testing::internal::AlwaysTrue()) { statement; } +// instead of +// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) +// to avoid an MSVC warning on unreachable code. 
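Mirroring the fatal case, a minimal sketch of the non-fatal macro follows; as noted above, here the statement may reference local variables. The test name is hypothetical.

#include "gtest/gtest-spi.h"

TEST(SpiSketch, CatchesExactlyOneNonFatalFailure) {
  const int expected = 1;
  // EXPECT_EQ produces a non-fatal failure; locals are allowed because the
  // statement runs inline rather than inside a generated helper class.
  EXPECT_NONFATAL_FAILURE(EXPECT_EQ(expected, 2) << "off by one",
                          "off by one");
}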
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h new file mode 100644 index 0000000000..77eb844839 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -0,0 +1,179 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. 
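Outside of Google Test's own internals, TestPartResult objects (declared below) mostly surface in event listeners. A hypothetical tally listener is sketched here; the class and member names are illustrative, not part of this header.

#include "gtest/gtest.h"

// Counts non-fatal failures as they are reported; nonfatally_failed() is one
// of the accessors declared on TestPartResult below.
class FailureTally : public ::testing::EmptyTestEventListener {
 public:
  FailureTally() : nonfatal_count_(0) {}
  virtual void OnTestPartResult(const ::testing::TestPartResult& result) {
    if (result.nonfatally_failed()) ++nonfatal_count_;
  }
  int nonfatal_count() const { return nonfatal_count_; }
 private:
  int nonfatal_count_;
};

// Registration (ownership passes to the listener list):
//   ::testing::UnitTest::GetInstance()->listeners().Append(new FailureTally);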
+class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure // Failed and the test should be terminated. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, + const char* a_file_name, + int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name == NULL ? "" : a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) { + } + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { + return file_name_.empty() ? NULL : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true iff the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true iff the test part failed. + bool failed() const { return type_ != kSuccess; } + + // Returns true iff the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true iff the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. +class TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. 
To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + virtual ~HasNewFatalFailureHelper(); + virtual void ReportTestPartResult(const TestPartResult& result); + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h new file mode 100644 index 0000000000..5f69d5678e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -0,0 +1,263 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... 
+ typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test case, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_CASE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_CASE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test case as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + // Since we are inside a derived class template, C++ requires use to + // visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test case +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_CASE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test case as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test case name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_CASE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test case name. Remember to pick unique prefixes for +// different instances. 
+typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); + +#endif // 0 + +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-type-util.h" + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test case. +# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. Types) +# define TYPED_TEST_CASE(CaseName, Types) \ + typedef ::testing::internal::TypeList< Types >::type \ + GTEST_TYPE_PARAMS_(CaseName) + +# define TYPED_TEST(CaseName, TestName) \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel< \ + GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ + GTEST_TYPE_PARAMS_(CaseName)>::Register(\ + "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + #CaseName, #TestName, 0); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() + +#endif // GTEST_HAS_TYPED_TEST + +// Implements type-parameterized tests. + +#if GTEST_HAS_TYPED_TEST_P + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test case are defined in. The exact +// name of the namespace is subject to change without notice. +# define GTEST_CASE_NAMESPACE_(TestCaseName) \ + gtest_case_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test case. +# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ + gtest_typed_test_case_p_state_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. +// +// Expands to the name of the variable used to remember the names of +// the registered tests in the given test case. +# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ + gtest_registered_test_names_##TestCaseName##_ + +// The variables defined in the type-parameterized test macros are +// static as typically these macros are used in a .h file that can be +// #included in multiple translation units linked together. 
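To complement the elided "..." sketch earlier in this header, here is a compilable example of the plain typed-test macros defined above (TYPED_TEST_CASE / TYPED_TEST); the fixture, type list, and test names are illustrative.

#include "gtest/gtest.h"

template <typename T>
class CounterTest : public ::testing::Test {
 protected:
  CounterTest() : value_(0) {}
  T value_;
};

// The typedef keeps the comma-separated type list parseable by the macro.
typedef ::testing::Types<char, int, long> CounterTypes;
TYPED_TEST_CASE(CounterTest, CounterTypes);

TYPED_TEST(CounterTest, StartsAtZero) {
  // TypeParam is the element of CounterTypes currently being instantiated;
  // fixture members are reached via this-> because the fixture is a
  // dependent base class.
  TypeParam n = this->value_;
  EXPECT_EQ(TypeParam(0), n);
}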
+# define TYPED_TEST_CASE_P(CaseName) \ + static ::testing::internal::TypedTestCasePState \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) + +# define TYPED_TEST_P(CaseName, TestName) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + template \ + class TestName : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ + __FILE__, __LINE__, #CaseName, #TestName); \ + } \ + template \ + void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() + +# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ + __FILE__, __LINE__, #__VA_ARGS__) + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. Types) +# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ + bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestCase::type>::Register(\ + #Prefix, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_CASE_P_STATE_(CaseName), \ + #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) + +#endif // GTEST_HAS_TYPED_TEST_P + +#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h index 581a44e95f..f846c5bd66 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -55,17442 +55,29 @@ #include #include -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares functions and macros used internally by -// Google Test. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan) -// -// Low-level types and utilities for porting Google Test to various -// platforms. They are subject to change without notice. DO NOT USE -// THEM IN USER CODE. -// -// This file is fundamental to Google Test. All other Google Test source -// files are expected to #include this. Therefore, it cannot #include -// any other Google Test header. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -// The user can define the following macros in the build script to -// control Google Test's behavior. If the user doesn't define a macro -// in this list, Google Test will define it. -// -// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) -// is/isn't available. -// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions -// are enabled. -// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::string, which is different to std::string). 
-// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::wstring, which is different to std::wstring). -// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular -// expressions are/aren't available. -// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that -// is/isn't available. -// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't -// enabled. -// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that -// std::wstring does/doesn't work (Google Test can -// be used where std::wstring is unavailable). -// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple -// is/isn't available. -// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the -// compiler supports Microsoft's "Structured -// Exception Handling". -// GTEST_HAS_STREAM_REDIRECTION -// - Define it to 1/0 to indicate whether the -// platform supports I/O stream redirection using -// dup() and dup2(). -// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google -// Test's own tr1 tuple implementation should be -// used. Unused when the user sets -// GTEST_HAS_TR1_TUPLE to 0. -// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test -// is building in C++11/C++98 mode. -// GTEST_LINKED_AS_SHARED_LIBRARY -// - Define to 1 when compiling tests that use -// Google Test as a shared library (known as -// DLL on Windows). -// GTEST_CREATE_SHARED_LIBRARY -// - Define to 1 when compiling Google Test itself -// as a shared library. - -// This header defines the following utilities: -// -// Macros indicating the current platform (defined to 1 if compiled on -// the given platform; otherwise undefined): -// GTEST_OS_AIX - IBM AIX -// GTEST_OS_CYGWIN - Cygwin -// GTEST_OS_HPUX - HP-UX -// GTEST_OS_LINUX - Linux -// GTEST_OS_LINUX_ANDROID - Google Android -// GTEST_OS_MAC - Mac OS X -// GTEST_OS_IOS - iOS -// GTEST_OS_IOS_SIMULATOR - iOS simulator -// GTEST_OS_NACL - Google Native Client (NaCl) -// GTEST_OS_OPENBSD - OpenBSD -// GTEST_OS_QNX - QNX -// GTEST_OS_SOLARIS - Sun Solaris -// GTEST_OS_SYMBIAN - Symbian -// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) -// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop -// GTEST_OS_WINDOWS_MINGW - MinGW -// GTEST_OS_WINDOWS_MOBILE - Windows Mobile -// GTEST_OS_ZOS - z/OS -// -// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the -// most stable support. Since core members of the Google Test project -// don't have access to other platforms, support for them may be less -// stable. If you notice any problems on your platform, please notify -// googletestframework@googlegroups.com (patches for fixing them are -// even more welcome!). -// -// Note that it is possible that none of the GTEST_OS_* macros are defined. -// -// Macros indicating available Google Test features (defined to 1 if -// the corresponding feature is supported; otherwise undefined): -// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized -// tests) -// GTEST_HAS_DEATH_TEST - death tests -// GTEST_HAS_PARAM_TEST - value-parameterized tests -// GTEST_HAS_TYPED_TEST - typed tests -// GTEST_HAS_TYPED_TEST_P - type-parameterized tests -// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with -// GTEST_HAS_POSIX_RE (see above) which users can -// define themselves. -// GTEST_USES_SIMPLE_RE - our own simple regex is used; -// the above two are mutually exclusive. -// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). 
-// -// Macros for basic C++ coding: -// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. -// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a -// variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. -// -// Synchronization: -// Mutex, MutexLock, ThreadLocal, GetThreadCount() -// - synchronization primitives. -// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above -// synchronization primitives have real implementations -// and Google Test is thread-safe; or 0 otherwise. -// -// Template meta programming: -// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. -// IteratorTraits - partial implementation of std::iterator_traits, which -// is not available in libCstd when compiled with Sun C++. -// -// Smart pointers: -// scoped_ptr - as in TR2. -// -// Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like -// platforms, or a reduced regular exception syntax on -// other platforms, including Windows. -// -// Logging: -// GTEST_LOG_() - logs messages at the specified severity level. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. -// -// Stdout and stderr capturing: -// CaptureStdout() - starts capturing stdout. -// GetCapturedStdout() - stops capturing stdout and returns the captured -// string. -// CaptureStderr() - starts capturing stderr. -// GetCapturedStderr() - stops capturing stderr and returns the captured -// string. -// -// Integer types: -// TypeWithSize - maps an integer to a int type. -// Int32, UInt32, Int64, UInt64, TimeInMillis -// - integers of known sizes. -// BiggestInt - the biggest signed integer type. -// -// Command-line utilities: -// GTEST_FLAG() - references a flag. -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. -// GetInjectableArgvs() - returns the command line as a vector of strings. -// -// Environment variable utilities: -// GetEnv() - gets the value of an environment variable. -// BoolFromGTestEnv() - parses a bool environment variable. -// Int32FromGTestEnv() - parses an Int32 environment variable. -// StringFromGTestEnv() - parses a string environment variable. - -#include // for isspace, etc -#include // for ptrdiff_t -#include -#include -#include -#ifndef _WIN32_WCE -# include -# include -#endif // !_WIN32_WCE - -#if defined __APPLE__ -# include -# include -#endif - -#include // NOLINT -#include // NOLINT -#include // NOLINT - -#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -#define GTEST_FLAG_PREFIX_ "gtest_" -#define GTEST_FLAG_PREFIX_DASH_ "gtest-" -#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -#define GTEST_NAME_ "Google Test" -#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/" - -// Determines the version of gcc that is used to compile this. -#ifdef __GNUC__ -// 40302 means version 4.3.2. -# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) -#endif // __GNUC__ - -// Determines the platform on which Google Test is compiled. 
-#ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -#elif defined __SYMBIAN32__ -# define GTEST_OS_SYMBIAN 1 -#elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(__MINGW__) || defined(__MINGW32__) -# define GTEST_OS_WINDOWS_MINGW 1 -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE -#elif defined __APPLE__ -# define GTEST_OS_MAC 1 -# if TARGET_OS_IPHONE -# define GTEST_OS_IOS 1 -# if TARGET_IPHONE_SIMULATOR -# define GTEST_OS_IOS_SIMULATOR 1 -# endif -# endif -#elif defined __linux__ -# define GTEST_OS_LINUX 1 -# if defined __ANDROID__ -# define GTEST_OS_LINUX_ANDROID 1 -# endif -#elif defined __MVS__ -# define GTEST_OS_ZOS 1 -#elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 -#elif defined(_AIX) -# define GTEST_OS_AIX 1 -#elif defined(__hpux) -# define GTEST_OS_HPUX 1 -#elif defined __native_client__ -# define GTEST_OS_NACL 1 -#elif defined __OpenBSD__ -# define GTEST_OS_OPENBSD 1 -#elif defined __QNX__ -# define GTEST_OS_QNX 1 -#endif // __CYGWIN__ - -#ifndef GTEST_LANG_CXX11 -// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when -// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a -// value for __cplusplus, and recent versions of clang, gcc, and -// probably other compilers set that too in C++11 mode. -# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L -// Compiling in at least C++11 mode. -# define GTEST_LANG_CXX11 1 -# else -# define GTEST_LANG_CXX11 0 -# endif -#endif - -// Brings in definitions for functions used in the testing::internal::posix -// namespace (read, write, close, chdir, isatty, stat). We do not currently -// use them on Windows Mobile. -#if !GTEST_OS_WINDOWS -// This assumes that non-Windows OSes provide unistd.h. For OSes where this -// is not the case, we need to include headers that provide the functions -// mentioned above. -# include -# include -#elif !GTEST_OS_WINDOWS_MOBILE -# include -# include -#endif - -#if GTEST_OS_LINUX_ANDROID -// Used to define __ANDROID_API__ matching the target NDK API level. -# include // NOLINT -#endif - -// Defines this to true iff Google Test can use POSIX regular expressions. -#ifndef GTEST_HAS_POSIX_RE -# if GTEST_OS_LINUX_ANDROID -// On Android, is only available starting with Gingerbread. -# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) -# else -# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) -# endif -#endif - -#if GTEST_HAS_POSIX_RE - -// On some platforms, needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included , which is guaranteed to define size_t through -// . -# include // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#else - -// may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_HAS_POSIX_RE - -#ifndef GTEST_HAS_EXCEPTIONS -// The user didn't tell us whether exceptions are enabled, so we need -// to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS -// macro to enable exceptions, so we'll do the same. -// Assumes that exceptions are enabled by default. 
-# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__GNUC__) && __EXCEPTIONS -// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) -// Sun Pro CC supports exceptions. However, there is no compile-time way of -// detecting whether they are enabled or not. Therefore, we assume that -// they are enabled unless the user tells us otherwise. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS -// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) -// Exception handling is in effect by default in HP aCC compiler. It has to -// be turned of by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else -// For other compilers, we assume exceptions are disabled to be -// conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) -#endif // GTEST_HAS_EXCEPTIONS - -#if !defined(GTEST_HAS_STD_STRING) -// Even though we don't use this macro any longer, we keep it in case -// some clients still depend on it. -# define GTEST_HAS_STD_STRING 1 -#elif !GTEST_HAS_STD_STRING -// The user told us that ::std::string isn't available. -# error "Google Test cannot be used where ::std::string isn't available." -#endif // !defined(GTEST_HAS_STD_STRING) - -#ifndef GTEST_HAS_GLOBAL_STRING -// The user didn't tell us whether ::string is available, so we need -// to figure it out. - -# define GTEST_HAS_GLOBAL_STRING 0 - -#endif // GTEST_HAS_GLOBAL_STRING - -#ifndef GTEST_HAS_STD_WSTRING -// The user didn't tell us whether ::std::wstring is available, so we need -// to figure it out. -// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring -// is available. - -// Cygwin 1.7 and below doesn't support ::std::wstring. -// Solaris' libc++ doesn't support it either. Android has -// no support for it at least as recent as Froyo (2.2). -# define GTEST_HAS_STD_WSTRING \ - (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) - -#endif // GTEST_HAS_STD_WSTRING - -#ifndef GTEST_HAS_GLOBAL_WSTRING -// The user didn't tell us whether ::wstring is available, so we need -// to figure it out. -# define GTEST_HAS_GLOBAL_WSTRING \ - (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Determines whether RTTI is available. -#ifndef GTEST_HAS_RTTI -// The user didn't tell us whether RTTI is enabled, so we need to -// figure it out. - -# ifdef _MSC_VER - -# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) - -# ifdef __GXX_RTTI -// When building against STLport with the Android NDK and with -// -frtti -fno-exceptions, the build fails at link time with undefined -// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, -// so disable RTTI when detected. -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ - !defined(__EXCEPTIONS) -# define GTEST_HAS_RTTI 0 -# else -# define GTEST_HAS_RTTI 1 -# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI - -// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends -// using has_feature instead. 
has_feature(cxx_rtti) is supported since 2.7, the -// first version with C++ support. -# elif defined(__clang__) - -# define GTEST_HAS_RTTI __has_feature(cxx_rtti) - -// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if -// both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) - -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -# else - -// For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 - -# endif // _MSC_VER - -#endif // GTEST_HAS_RTTI - -// It's this header's responsibility to #include when RTTI -// is enabled. -#if GTEST_HAS_RTTI -# include -#endif - -// Determines whether Google Test can use the pthreads library. -#ifndef GTEST_HAS_PTHREAD -// The user didn't tell us explicitly, so we assume pthreads support is -// available on Linux and Mac. -// -// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 -// to your compiler flags. -# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ - || GTEST_OS_QNX) -#endif // GTEST_HAS_PTHREAD - -#if GTEST_HAS_PTHREAD -// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is -// true. -# include // NOLINT - -// For timespec and nanosleep, used below. -# include // NOLINT -#endif - -// Determines whether Google Test can use tr1/tuple. You can define -// this macro to 0 to prevent Google Test from using tuple (any -// feature depending on tuple with be disabled in this mode). -#ifndef GTEST_HAS_TR1_TUPLE -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) -// STLport, provided with the Android NDK, has neither or . -# define GTEST_HAS_TR1_TUPLE 0 -# else -// The user didn't tell us not to do it, so we assume it's OK. -# define GTEST_HAS_TR1_TUPLE 1 -# endif -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether Google Test's own tr1 tuple implementation -// should be used. -#ifndef GTEST_USE_OWN_TR1_TUPLE -// The user didn't tell us, so we need to figure it out. - -// We use our own TR1 tuple if we aren't sure the user has an -// implementation of it already. At this time, libstdc++ 4.0.0+ and -// MSVC 2010 are the only mainstream standard libraries that come -// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler -// pretends to be GCC by defining __GNUC__ and friends, but cannot -// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1 -// tuple in a 323 MB Feature Pack download, which we cannot assume the -// user has. QNX's QCC compiler is a modified GCC but it doesn't -// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, -// and it can be used with some compilers that define __GNUC__. -# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ - && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 -# define GTEST_ENV_HAS_TR1_TUPLE_ 1 -# endif - -// C++11 specifies that provides std::tuple. Use that if gtest is used -// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 -// can build with clang but need to use gcc4.2's libstdc++). 
-# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) -# define GTEST_ENV_HAS_STD_TUPLE_ 1 -# endif - -# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ -# define GTEST_USE_OWN_TR1_TUPLE 0 -# else -# define GTEST_USE_OWN_TR1_TUPLE 1 -# endif - -#endif // GTEST_USE_OWN_TR1_TUPLE - -// To avoid conditional compilation everywhere, we make it -// gtest-port.h's responsibility to #include the header implementing -// tr1/tuple. -#if GTEST_HAS_TR1_TUPLE - -# if GTEST_USE_OWN_TR1_TUPLE -// This file was GENERATED by command: -// pump.py gtest-tuple.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2009 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Implements a subset of TR1 tuple needed by Google Test and Google Mock. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ - -#include // For ::std::pair. - -// The compiler used in Symbian has a bug that prevents us from declaring the -// tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be -// private as public. -// Sun Studio versions < 12 also have the above bug. -#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: -#else -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ - template friend class tuple; \ - private: -#endif - -// GTEST_n_TUPLE_(T) is the type of an n-tuple. -#define GTEST_0_TUPLE_(T) tuple<> -#define GTEST_1_TUPLE_(T) tuple -#define GTEST_2_TUPLE_(T) tuple -#define GTEST_3_TUPLE_(T) tuple -#define GTEST_4_TUPLE_(T) tuple -#define GTEST_5_TUPLE_(T) tuple -#define GTEST_6_TUPLE_(T) tuple -#define GTEST_7_TUPLE_(T) tuple -#define GTEST_8_TUPLE_(T) tuple -#define GTEST_9_TUPLE_(T) tuple -#define GTEST_10_TUPLE_(T) tuple - -// GTEST_n_TYPENAMES_(T) declares a list of n typenames. 
-#define GTEST_0_TYPENAMES_(T) -#define GTEST_1_TYPENAMES_(T) typename T##0 -#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 -#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 -#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3 -#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4 -#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5 -#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6 -#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 -#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8 -#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8, typename T##9 - -// In theory, defining stuff in the ::std namespace is undefined -// behavior. We can do this as we are playing the role of a standard -// library vendor. -namespace std { -namespace tr1 { - -template -class tuple; - -// Anything in namespace gtest_internal is Google Test's INTERNAL -// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. -namespace gtest_internal { - -// ByRef::type is T if T is a reference; otherwise it's const T&. -template -struct ByRef { typedef const T& type; }; // NOLINT -template -struct ByRef { typedef T& type; }; // NOLINT - -// A handy wrapper for ByRef. -#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type - -// AddRef::type is T if T is a reference; otherwise it's T&. This -// is the same as tr1::add_reference::type. -template -struct AddRef { typedef T& type; }; // NOLINT -template -struct AddRef { typedef T& type; }; // NOLINT - -// A handy wrapper for AddRef. -#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type - -// A helper for implementing get(). -template class Get; - -// A helper for implementing tuple_element. kIndexValid is true -// iff k < the number of fields in tuple type T. 
-template -struct TupleElement; - -template -struct TupleElement { - typedef T0 type; -}; - -template -struct TupleElement { - typedef T1 type; -}; - -template -struct TupleElement { - typedef T2 type; -}; - -template -struct TupleElement { - typedef T3 type; -}; - -template -struct TupleElement { - typedef T4 type; -}; - -template -struct TupleElement { - typedef T5 type; -}; - -template -struct TupleElement { - typedef T6 type; -}; - -template -struct TupleElement { - typedef T7 type; -}; - -template -struct TupleElement { - typedef T8 type; -}; - -template -struct TupleElement { - typedef T9 type; -}; - -} // namespace gtest_internal - -template <> -class tuple<> { - public: - tuple() {} - tuple(const tuple& /* t */) {} - tuple& operator=(const tuple& /* t */) { return *this; } -}; - -template -class GTEST_1_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} - - tuple(const tuple& t) : f0_(t.f0_) {} - - template - tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_1_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { - f0_ = t.f0_; - return *this; - } - - T0 f0_; -}; - -template -class GTEST_2_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), - f1_(f1) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} - - template - tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} - template - tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_2_TUPLE_(U)& t) { - return CopyFrom(t); - } - template - tuple& operator=(const ::std::pair& p) { - f0_ = p.first; - f1_ = p.second; - return *this; - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - return *this; - } - - T0 f0_; - T1 f1_; -}; - -template -class GTEST_3_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - template - tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_3_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; -}; - -template -class GTEST_4_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} - - template - tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_4_TUPLE_(U)& 
t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; -}; - -template -class GTEST_5_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, - GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_) {} - - template - tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_5_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; -}; - -template -class GTEST_6_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_) {} - - template - tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_6_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; -}; - -template -class GTEST_7_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - template - tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_7_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; -}; - -template -class GTEST_8_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - 
GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, - GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - template - tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_8_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; -}; - -template -class GTEST_9_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - template - tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_9_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; -}; - -template -class tuple { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), - f9_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} - - template - tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), - f9_(t.f9_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_10_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - f9_ = t.f9_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 
f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; - T9 f9_; -}; - -// 6.1.3.2 Tuple creation functions. - -// Known limitations: we don't support passing an -// std::tr1::reference_wrapper to make_tuple(). And we don't -// implement tie(). - -inline tuple<> make_tuple() { return tuple<>(); } - -template -inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { - return GTEST_1_TUPLE_(T)(f0); -} - -template -inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { - return GTEST_2_TUPLE_(T)(f0, f1); -} - -template -inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { - return GTEST_3_TUPLE_(T)(f0, f1, f2); -} - -template -inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3) { - return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); -} - -template -inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4) { - return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); -} - -template -inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5) { - return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); -} - -template -inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6) { - return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); -} - -template -inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { - return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); -} - -template -inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8) { - return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); -} - -template -inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8, const T9& f9) { - return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); -} - -// 6.1.3.3 Tuple helper classes. - -template struct tuple_size; - -template -struct tuple_size { - static const int value = 0; -}; - -template -struct tuple_size { - static const int value = 1; -}; - -template -struct tuple_size { - static const int value = 2; -}; - -template -struct tuple_size { - static const int value = 3; -}; - -template -struct tuple_size { - static const int value = 4; -}; - -template -struct tuple_size { - static const int value = 5; -}; - -template -struct tuple_size { - static const int value = 6; -}; - -template -struct tuple_size { - static const int value = 7; -}; - -template -struct tuple_size { - static const int value = 8; -}; - -template -struct tuple_size { - static const int value = 9; -}; - -template -struct tuple_size { - static const int value = 10; -}; - -template -struct tuple_element { - typedef typename gtest_internal::TupleElement< - k < (tuple_size::value), k, Tuple>::type type; -}; - -#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type - -// 6.1.3.4 Element access. 
- -namespace gtest_internal { - -template <> -class Get<0> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - Field(Tuple& t) { return t.f0_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - ConstField(const Tuple& t) { return t.f0_; } -}; - -template <> -class Get<1> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - Field(Tuple& t) { return t.f1_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - ConstField(const Tuple& t) { return t.f1_; } -}; - -template <> -class Get<2> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - Field(Tuple& t) { return t.f2_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - ConstField(const Tuple& t) { return t.f2_; } -}; - -template <> -class Get<3> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - Field(Tuple& t) { return t.f3_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - ConstField(const Tuple& t) { return t.f3_; } -}; - -template <> -class Get<4> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - Field(Tuple& t) { return t.f4_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - ConstField(const Tuple& t) { return t.f4_; } -}; - -template <> -class Get<5> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - Field(Tuple& t) { return t.f5_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - ConstField(const Tuple& t) { return t.f5_; } -}; - -template <> -class Get<6> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - Field(Tuple& t) { return t.f6_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - ConstField(const Tuple& t) { return t.f6_; } -}; - -template <> -class Get<7> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - Field(Tuple& t) { return t.f7_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - ConstField(const Tuple& t) { return t.f7_; } -}; - -template <> -class Get<8> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - Field(Tuple& t) { return t.f8_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - ConstField(const Tuple& t) { return t.f8_; } -}; - -template <> -class Get<9> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - Field(Tuple& t) { return t.f9_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - ConstField(const Tuple& t) { return t.f9_; } -}; - -} // namespace gtest_internal - -template -GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::Field(t); -} - -template -GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(const GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::ConstField(t); -} - -// 6.1.3.5 Relational operators - -// We only implement == and !=, as we don't have a need for the rest yet. - -namespace gtest_internal { - -// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the -// first k fields of t1 equals the first k fields of t2. -// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if -// k1 != k2. 
-template -struct SameSizeTuplePrefixComparator; - -template <> -struct SameSizeTuplePrefixComparator<0, 0> { - template - static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { - return true; - } -}; - -template -struct SameSizeTuplePrefixComparator { - template - static bool Eq(const Tuple1& t1, const Tuple2& t2) { - return SameSizeTuplePrefixComparator::Eq(t1, t2) && - ::std::tr1::get(t1) == ::std::tr1::get(t2); - } -}; - -} // namespace gtest_internal - -template -inline bool operator==(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { - return gtest_internal::SameSizeTuplePrefixComparator< - tuple_size::value, - tuple_size::value>::Eq(t, u); -} - -template -inline bool operator!=(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { return !(t == u); } - -// 6.1.4 Pairs. -// Unimplemented. - -} // namespace tr1 -} // namespace std - -#undef GTEST_0_TUPLE_ -#undef GTEST_1_TUPLE_ -#undef GTEST_2_TUPLE_ -#undef GTEST_3_TUPLE_ -#undef GTEST_4_TUPLE_ -#undef GTEST_5_TUPLE_ -#undef GTEST_6_TUPLE_ -#undef GTEST_7_TUPLE_ -#undef GTEST_8_TUPLE_ -#undef GTEST_9_TUPLE_ -#undef GTEST_10_TUPLE_ - -#undef GTEST_0_TYPENAMES_ -#undef GTEST_1_TYPENAMES_ -#undef GTEST_2_TYPENAMES_ -#undef GTEST_3_TYPENAMES_ -#undef GTEST_4_TYPENAMES_ -#undef GTEST_5_TYPENAMES_ -#undef GTEST_6_TYPENAMES_ -#undef GTEST_7_TYPENAMES_ -#undef GTEST_8_TYPENAMES_ -#undef GTEST_9_TYPENAMES_ -#undef GTEST_10_TYPENAMES_ - -#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ -#undef GTEST_BY_REF_ -#undef GTEST_ADD_REF_ -#undef GTEST_TUPLE_ELEMENT_ - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -# elif GTEST_ENV_HAS_STD_TUPLE_ -# include -// C++11 puts its tuple into the ::std namespace rather than -// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. -// This causes undefined behavior, but supported compilers react in -// the way we intend. -namespace std { -namespace tr1 { -using ::std::get; -using ::std::make_tuple; -using ::std::tuple; -using ::std::tuple_element; -using ::std::tuple_size; -} -} - -# elif GTEST_OS_SYMBIAN - -// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to -// use STLport's tuple implementation, which unfortunately doesn't -// work as the copy of STLport distributed with Symbian is incomplete. -// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to -// use its own tuple implementation. -# ifdef BOOST_HAS_TR1_TUPLE -# undef BOOST_HAS_TR1_TUPLE -# endif // BOOST_HAS_TR1_TUPLE - -// This prevents , which defines -// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . -# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED -# include - -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) -// GCC 4.0+ implements tr1/tuple in the header. This does -// not conform to the TR1 spec, which requires the header to be . - -# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -// Until version 4.3.2, gcc has a bug that causes , -// which is #included by , to not compile when RTTI is -// disabled. _TR1_FUNCTIONAL is the header guard for -// . Hence the following #define is a hack to prevent -// from being included. -# define _TR1_FUNCTIONAL 1 -# include -# undef _TR1_FUNCTIONAL // Allows the user to #include - // if he chooses to. -# else -# include // NOLINT -# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 - -# else -// If the compiler is not GCC 4.0+, we assume the user is using a -// spec-conforming TR1 implementation. 
-# include // NOLINT -# endif // GTEST_USE_OWN_TR1_TUPLE - -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether clone(2) is supported. -// Usually it will only be available on Linux, excluding -// Linux on the Itanium architecture. -// Also see http://linux.die.net/man/2/clone. -#ifndef GTEST_HAS_CLONE -// The user didn't tell us, so we need to figure it out. - -# if GTEST_OS_LINUX && !defined(__ia64__) -# if GTEST_OS_LINUX_ANDROID -// On Android, clone() is only available on ARM starting with Gingerbread. -# if defined(__arm__) && __ANDROID_API__ >= 9 -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif -# else -# define GTEST_HAS_CLONE 1 -# endif -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) - -#endif // GTEST_HAS_CLONE - -// Determines whether to support stream redirection. This is used to test -// output correctness and to implement death tests. -#ifndef GTEST_HAS_STREAM_REDIRECTION -// By default, we assume that stream redirection is supported on all -// platforms except known mobile ones. -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN -#endif // GTEST_HAS_STREAM_REDIRECTION - -// Determines whether to support death tests. -// Google Test does not support death tests for VC 7.1 and earlier as -// abort() in a VC 7.1 application compiled as GUI in debug config -// pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ - GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ - GTEST_OS_OPENBSD || GTEST_OS_QNX) -# define GTEST_HAS_DEATH_TEST 1 -# include // NOLINT -#endif - -// We don't support MSVC 7.1 with exceptions disabled now. Therefore -// all the compilers we care about are adequate for supporting -// value-parameterized tests. -#define GTEST_HAS_PARAM_TEST 1 - -// Determines whether to support type-driven tests. - -// Typed tests need and variadic macros, which GCC, VC++ 8.0, -// Sun Pro CC, IBM Visual Age, and HP aCC support. -#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ - defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 -#endif - -// Determines whether to support Combine(). This only makes sense when -// value-parameterized tests are enabled. The implementation doesn't -// work on Sun Studio since it doesn't understand templated conversion -// operators. -#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) -# define GTEST_HAS_COMBINE 1 -#endif - -// Determines whether the system compiler uses UTF-16 for encoding wide strings. -#define GTEST_WIDE_STRING_USES_UTF16_ \ - (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) - -// Determines whether test results can be streamed to a socket. -#if GTEST_OS_LINUX -# define GTEST_CAN_STREAM_RESULTS_ 1 -#endif - -// Defines some utility macros. - -// The GNU compiler emits a warning if nested "if" statements are followed by -// an "else" statement and braces are not used to explicitly disambiguate the -// "else" binding. This leads to problems with code like: -// -// if (gate) -// ASSERT_*(condition) << "Some message"; -// -// The "switch (0) case 0:" idiom is used to suppress this. 
-#ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ -#else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT -#endif - -// Use this annotation at the end of a struct/class definition to -// prevent the compiler from optimizing away instances that are never -// used. This is useful when all interesting logic happens inside the -// c'tor and / or d'tor. Example: -// -// struct Foo { -// Foo() { ... } -// } GTEST_ATTRIBUTE_UNUSED_; -// -// Also use it after a variable or parameter declaration to tell the -// compiler the variable/parameter does not have to be used. -#if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -#else -# define GTEST_ATTRIBUTE_UNUSED_ -#endif - -// A macro to disallow operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type)\ - void operator=(type const &) - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ - type(type const &);\ - GTEST_DISALLOW_ASSIGN_(type) - -// Tell the compiler to warn about unused return values for functions declared -// with this macro. The macro should be used on function declarations -// following the argument list: -// -// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; -#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) -#else -# define GTEST_MUST_USE_RESULT_ -#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC - -// Determine whether the compiler supports Microsoft's Structured Exception -// Handling. This is supported by several Windows compilers but generally -// does not exist on any other system. -#ifndef GTEST_HAS_SEH -// The user didn't tell us, so we need to figure it out. - -# if defined(_MSC_VER) || defined(__BORLANDC__) -// These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else -// Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif - -#endif // GTEST_HAS_SEH - -#ifdef _MSC_VER - -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif - -#endif // _MSC_VER - -#ifndef GTEST_API_ -# define GTEST_API_ -#endif - -#ifdef __GNUC__ -// Ask the compiler to never inline a given function. -# define GTEST_NO_INLINE_ __attribute__((noinline)) -#else -# define GTEST_NO_INLINE_ -#endif - -// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. -#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) -# define GTEST_HAS_CXXABI_H_ 1 -#else -# define GTEST_HAS_CXXABI_H_ 0 -#endif - -namespace testing { - -class Message; - -namespace internal { - -// A secret type that Google Test users don't know about. It has no -// definition on purpose. Therefore it's impossible to create a -// Secret object, which is what we want. -class Secret; - -// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time -// expression is true. For example, you could use it to verify the -// size of a static array: -// -// GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, -// content_type_names_incorrect_size); -// -// or to make sure a struct is smaller than a certain size: -// -// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); -// -// The second argument to the macro is the name of the variable. 
If -// the expression is false, most compilers will issue a warning/error -// containing the name of the variable. - -template -struct CompileAssert { -}; - -#define GTEST_COMPILE_ASSERT_(expr, msg) \ - typedef ::testing::internal::CompileAssert<(static_cast(expr))> \ - msg[static_cast(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ - -// Implementation details of GTEST_COMPILE_ASSERT_: -// -// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 -// elements (and thus is invalid) when the expression is false. -// -// - The simpler definition -// -// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] -// -// does not work, as gcc supports variable-length arrays whose sizes -// are determined at run-time (this is gcc's extension and not part -// of the C++ standard). As a result, gcc fails to reject the -// following code with the simple definition: -// -// int foo; -// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is -// // not a compile-time constant. -// -// - By using the type CompileAssert<(bool(expr))>, we ensures that -// expr is a compile-time constant. (Template arguments must be -// determined at compile-time.) -// -// - The outter parentheses in CompileAssert<(bool(expr))> are necessary -// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written -// -// CompileAssert -// -// instead, these compilers will refuse to compile -// -// GTEST_COMPILE_ASSERT_(5 > 0, some_message); -// -// (They seem to think the ">" in "5 > 0" marks the end of the -// template argument list.) -// -// - The array size is (bool(expr) ? 1 : -1), instead of simply -// -// ((expr) ? 1 : -1). -// -// This is to avoid running into a bug in MS VC 7.1, which -// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. - -// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. -// -// This template is declared, but intentionally undefined. -template -struct StaticAssertTypeEqHelper; - -template -struct StaticAssertTypeEqHelper {}; - -#if GTEST_HAS_GLOBAL_STRING -typedef ::string string; -#else -typedef ::std::string string; -#endif // GTEST_HAS_GLOBAL_STRING - -#if GTEST_HAS_GLOBAL_WSTRING -typedef ::wstring wstring; -#elif GTEST_HAS_STD_WSTRING -typedef ::std::wstring wstring; -#endif // GTEST_HAS_GLOBAL_WSTRING - -// A helper for suppressing warnings on constant condition. It just -// returns 'condition'. -GTEST_API_ bool IsTrue(bool condition); - -// Defines scoped_ptr. - -// This implementation of scoped_ptr is PARTIAL - it only contains -// enough stuff to satisfy Google Test's need. -template -class scoped_ptr { - public: - typedef T element_type; - - explicit scoped_ptr(T* p = NULL) : ptr_(p) {} - ~scoped_ptr() { reset(); } - - T& operator*() const { return *ptr_; } - T* operator->() const { return ptr_; } - T* get() const { return ptr_; } - - T* release() { - T* const ptr = ptr_; - ptr_ = NULL; - return ptr; - } - - void reset(T* p = NULL) { - if (p != ptr_) { - if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. - delete ptr_; - } - ptr_ = p; - } - } - - private: - T* ptr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); -}; - -// Defines RE. - -// A simple C++ wrapper for . It uses the POSIX Extended -// Regular Expression syntax. -class GTEST_API_ RE { - public: - // A copy constructor is required by the Standard to initialize object - // references from r-values. - RE(const RE& other) { Init(other.pattern()); } - - // Constructs an RE from a string. 
- RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT - -#if GTEST_HAS_GLOBAL_STRING - - RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT - -#endif // GTEST_HAS_GLOBAL_STRING - - RE(const char* regex) { Init(regex); } // NOLINT - ~RE(); - - // Returns the string representation of the regex. - const char* pattern() const { return pattern_; } - - // FullMatch(str, re) returns true iff regular expression re matches - // the entire str. - // PartialMatch(str, re) returns true iff regular expression re - // matches a substring of str (including str itself). - // - // TODO(wan@google.com): make FullMatch() and PartialMatch() work - // when str contains NUL characters. - static bool FullMatch(const ::std::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::std::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#if GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const ::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#endif // GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const char* str, const RE& re); - static bool PartialMatch(const char* str, const RE& re); - - private: - void Init(const char* regex); - - // We use a const char* instead of an std::string, as Google Test used to be - // used where std::string is not available. TODO(wan@google.com): change to - // std::string. - const char* pattern_; - bool is_valid_; - -#if GTEST_USES_POSIX_RE - - regex_t full_regex_; // For FullMatch(). - regex_t partial_regex_; // For PartialMatch(). - -#else // GTEST_USES_SIMPLE_RE - - const char* full_pattern_; // For FullMatch(); - -#endif - - GTEST_DISALLOW_ASSIGN_(RE); -}; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, - int line); - -// Defines logging utilities: -// GTEST_LOG_(severity) - logs messages at the specified severity level. The -// message itself is streamed into the macro. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. - -enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; - -// Formats log entry severity, provides a stream object for streaming the -// log message, and terminates the message with a newline when going out of -// scope. -class GTEST_API_ GTestLog { - public: - GTestLog(GTestLogSeverity severity, const char* file, int line); - - // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. - ~GTestLog(); - - ::std::ostream& GetStream() { return ::std::cerr; } - - private: - const GTestLogSeverity severity_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); -}; - -#define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() - -inline void LogToStderr() {} -inline void FlushInfoLog() { fflush(NULL); } - -// INTERNAL IMPLEMENTATION - DO NOT USE. -// -// GTEST_CHECK_ is an all-mode assert. 
It aborts the program if the condition -// is not satisfied. -// Synopsys: -// GTEST_CHECK_(boolean_condition); -// or -// GTEST_CHECK_(boolean_condition) << "Additional message"; -// -// This checks the condition and if the condition is not satisfied -// it prints message about the condition violation, including the -// condition itself, plus additional message streamed into it, if any, -// and then it aborts the program. It aborts the program irrespective of -// whether it is built in the debug mode or not. -#define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " - -// An all-mode assert to verify that the given POSIX-style function -// call returns 0 (indicating success). Known limitation: this -// doesn't expand to a balanced 'if' statement, so enclose the macro -// in {} if you need to use it as the only statement in an 'if' -// branch. -#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Use ImplicitCast_ as a safe version of static_cast for upcasting in -// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a -// const Foo*). When you use ImplicitCast_, the compiler checks that -// the cast is safe. Such explicit ImplicitCast_s are necessary in -// surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. -// -// The syntax for using ImplicitCast_ is the same as for static_cast: -// -// ImplicitCast_(expr) -// -// ImplicitCast_ would have been part of the C++ standard library, -// but the proposal was submitted too late. It will probably make -// its way into the language in the future. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., implicit_cast). The internal -// namespace alone is not enough because the function can be found by ADL. -template -inline To ImplicitCast_(To x) { return x; } - -// When you upcast (that is, cast a pointer from type Foo to type -// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts -// always succeed. When you downcast (that is, cast a pointer from -// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because -// how do you know the pointer is really of type SubclassOfFoo? It -// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, -// when you downcast, you should use this macro. In debug mode, we -// use dynamic_cast<> to double-check the downcast is legal (we die -// if it's not). In normal mode, we do the efficient static_cast<> -// instead. Thus, it's important to test in debug mode to make sure -// the cast is legal! -// This is the only place in the code we should use dynamic_cast<>. -// In particular, you SHOULDN'T be using dynamic_cast<> in order to -// do RTTI (eg code like this: -// if (dynamic_cast(foo)) HandleASubclass1Object(foo); -// if (dynamic_cast(foo)) HandleASubclass2Object(foo); -// You should design the code some other way not to need this. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., down_cast). The internal -// namespace alone is not enough because the function can be found by ADL. 
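// Editor's sketch (illustrative, not gtest's code): the "if/else + stream" idiom
// GTEST_CHECK_ relies on above. The macro expands to a statement whose else-branch
// is a stream expression, so callers can append "<< extra text", and the abort runs
// only when the condition is false. gtest additionally prefixes an ambiguous-else
// blocker, which this sketch omits for brevity.
#include <cstdlib>
#include <iostream>

struct FatalStreamSketch {
  std::ostream& Get() { return std::cerr; }
  ~FatalStreamSketch() { std::cerr << std::endl; std::abort(); }  // runs after streaming
};

#define CHECK_SKETCH(condition)  \
  if (condition)                 \
    ;                            \
  else                           \
    FatalStreamSketch().Get() << "Condition " #condition " failed. "

// Usage: CHECK_SKETCH(fp != NULL) << "could not open the input file";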
-template // use like this: DownCast_(foo); -inline To DownCast_(From* f) { // so we only accept pointers - // Ensures that To is a sub-type of From *. This test is here only - // for compile-time type checking, and has no overhead in an - // optimized build at run-time, as it will be optimized away - // completely. - if (false) { - const To to = NULL; - ::testing::internal::ImplicitCast_(to); - } - -#if GTEST_HAS_RTTI - // RTTI: debug mode only! - GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); -#endif - return static_cast(f); -} - -// Downcasts the pointer of type Base to Derived. -// Derived must be a subclass of Base. The parameter MUST -// point to a class of type Derived, not any subclass of it. -// When RTTI is available, the function performs a runtime -// check to enforce this. -template -Derived* CheckedDowncastToActualType(Base* base) { -#if GTEST_HAS_RTTI - GTEST_CHECK_(typeid(*base) == typeid(Derived)); - return dynamic_cast(base); // NOLINT -#else - return static_cast(base); // Poor man's downcast. -#endif -} - -#if GTEST_HAS_STREAM_REDIRECTION - -// Defines the stderr capturer: -// CaptureStdout - starts capturing stdout. -// GetCapturedStdout - stops capturing stdout and returns the captured string. -// CaptureStderr - starts capturing stderr. -// GetCapturedStderr - stops capturing stderr and returns the captured string. -// -GTEST_API_ void CaptureStdout(); -GTEST_API_ std::string GetCapturedStdout(); -GTEST_API_ void CaptureStderr(); -GTEST_API_ std::string GetCapturedStderr(); - -#endif // GTEST_HAS_STREAM_REDIRECTION - - -#if GTEST_HAS_DEATH_TEST - -const ::std::vector& GetInjectableArgvs(); -void SetInjectableArgvs(const ::std::vector* - new_argvs); - -// A copy of all command line arguments. Set by InitGoogleTest(). -extern ::std::vector g_argvs; - -#endif // GTEST_HAS_DEATH_TEST - -// Defines synchronization primitives. - -#if GTEST_HAS_PTHREAD - -// Sleeps for (roughly) n milli-seconds. This function is only for -// testing Google Test's own constructs. Don't use it in user tests, -// either directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, NULL); -} - -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - } - ~Notification() { - pthread_mutex_destroy(&mutex_); - } - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { - pthread_mutex_lock(&mutex_); - notified_ = true; - pthread_mutex_unlock(&mutex_); - } - - // Blocks until the controller thread notifies. Must be called from a test - // thread. - void WaitForNotification() { - for (;;) { - pthread_mutex_lock(&mutex_); - const bool notified = notified_; - pthread_mutex_unlock(&mutex_); - if (notified) - break; - SleepMilliseconds(10); - } - } - - private: - pthread_mutex_t mutex_; - bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. 
-// Consequently, it cannot select a correct instantiation of ThreadWithParam -// in order to call its Run(). Introducing ThreadWithParamBase as a -// non-templated base class for ThreadWithParam allows us to bypass this -// problem. -class ThreadWithParamBase { - public: - virtual ~ThreadWithParamBase() {} - virtual void Run() = 0; -}; - -// pthread_create() accepts a pointer to a function type with the C linkage. -// According to the Standard (7.5/1), function types with different linkages -// are different even if they are otherwise identical. Some compilers (for -// example, SunStudio) treat them as different types. Since class methods -// cannot be defined with C-linkage we need to define a free C-function to -// pass into pthread_create(). -extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { - static_cast(thread)->Run(); - return NULL; -} - -// Helper class for testing Google Test's multi-threading constructs. -// To use it, write: -// -// void ThreadFunc(int param) { /* Do things with param */ } -// Notification thread_can_start; -// ... -// // The thread_can_start parameter is optional; you can supply NULL. -// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); -// thread_can_start.Notify(); -// -// These classes are only for testing Google Test's own constructs. Do -// not use them in user tests, either directly or indirectly. -template -class ThreadWithParam : public ThreadWithParamBase { - public: - typedef void (*UserThreadFunc)(T); - - ThreadWithParam( - UserThreadFunc func, T param, Notification* thread_can_start) - : func_(func), - param_(param), - thread_can_start_(thread_can_start), - finished_(false) { - ThreadWithParamBase* const base = this; - // The thread can be created only after all fields except thread_ - // have been initialized. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); - } - ~ThreadWithParam() { Join(); } - - void Join() { - if (!finished_) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); - finished_ = true; - } - } - - virtual void Run() { - if (thread_can_start_ != NULL) - thread_can_start_->WaitForNotification(); - func_(param_); - } - - private: - const UserThreadFunc func_; // User-supplied thread function. - const T param_; // User-supplied parameter to the thread function. - // When non-NULL, used to block execution until the controller thread - // notifies. - Notification* const thread_can_start_; - bool finished_; // true iff we know that the thread function has finished. - pthread_t thread_; // The native thread object. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); -}; - -// MutexBase and Mutex implement mutex on pthreads-based platforms. They -// are used in conjunction with class MutexLock: -// -// Mutex mutex; -// ... -// MutexLock lock(&mutex); // Acquires the mutex and releases it at the end -// // of the current scope. -// -// MutexBase implements behavior for both statically and dynamically -// allocated mutexes. Do not use MutexBase directly. Instead, write -// the following to define a static mutex: -// -// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); -// -// You can forward declare a static mutex like this: -// -// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); -// -// To create a dynamic mutex, just define an object of type Mutex. -class MutexBase { - public: - // Acquires this mutex. - void Lock() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); - owner_ = pthread_self(); - has_owner_ = true; - } - - // Releases this mutex. 
- void Unlock() { - // Since the lock is being released the owner_ field should no longer be - // considered valid. We don't protect writing to has_owner_ here, as it's - // the caller's responsibility to ensure that the current thread holds the - // mutex when this is called. - has_owner_ = false; - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); - } - - // Does nothing if the current thread holds the mutex. Otherwise, crashes - // with high probability. - void AssertHeld() const { - GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) - << "The current thread is not holding the mutex @" << this; - } - - // A static mutex may be used before main() is entered. It may even - // be used before the dynamic initialization stage. Therefore we - // must be able to initialize a static mutex object at link time. - // This means MutexBase has to be a POD and its member variables - // have to be public. - public: - pthread_mutex_t mutex_; // The underlying pthread mutex. - // has_owner_ indicates whether the owner_ field below contains a valid thread - // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All - // accesses to the owner_ field should be protected by a check of this field. - // An alternative might be to memset() owner_ to all zeros, but there's no - // guarantee that a zero'd pthread_t is necessarily invalid or even different - // from pthread_self(). - bool has_owner_; - pthread_t owner_; // The thread holding the mutex. -}; - -// Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex - -// Defines and statically (i.e. at link time) initializes a static mutex. -// The initialization list here does not explicitly initialize each field, -// instead relying on default initialization for the unspecified fields. In -// particular, the owner_ field (a pthread_t) is not explicitly initialized. -// This allows initialization to work whether pthread_t is a scalar or struct. -// The flag -Wmissing-field-initializers must not be specified for this to work. -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false } - -// The Mutex class can only be used for mutexes created at runtime. It -// shares its API with MutexBase otherwise. -class Mutex : public MutexBase { - public: - Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - has_owner_ = false; - } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); -}; - -// We cannot name this class MutexLock as the ctor declaration would -// conflict with a macro named MutexLock, which is defined on some -// platforms. Hence the typedef trick below. -class GTestMutexLock { - public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } - - ~GTestMutexLock() { mutex_->Unlock(); } - - private: - MutexBase* const mutex_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); -}; - -typedef GTestMutexLock MutexLock; - -// Helpers for ThreadLocal. - -// pthread_key_create() requires DeleteThreadLocalValue() to have -// C-linkage. Therefore it cannot be templatized to access -// ThreadLocal. Hence the need for class -// ThreadLocalValueHolderBase. -class ThreadLocalValueHolderBase { - public: - virtual ~ThreadLocalValueHolderBase() {} -}; - -// Called by pthread to delete thread-local data stored by -// pthread_setspecific(). 
-extern "C" inline void DeleteThreadLocalValue(void* value_holder) { - delete static_cast(value_holder); -} - -// Implements thread-local storage on pthreads-based systems. -// -// // Thread 1 -// ThreadLocal tl(100); // 100 is the default value for each thread. -// -// // Thread 2 -// tl.set(150); // Changes the value for thread 2 only. -// EXPECT_EQ(150, tl.get()); -// -// // Thread 1 -// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. -// tl.set(200); -// EXPECT_EQ(200, tl.get()); -// -// The template type argument T must have a public copy constructor. -// In addition, the default ThreadLocal constructor requires T to have -// a public default constructor. -// -// An object managed for a thread by a ThreadLocal instance is deleted -// when the thread exits. Or, if the ThreadLocal instance dies in -// that thread, when the ThreadLocal dies. It's the user's -// responsibility to ensure that all other threads using a ThreadLocal -// have exited when it dies, or the per-thread objects for those -// threads will not be deleted. -// -// Google Test only uses global ThreadLocal objects. That means they -// will die after main() has returned. Therefore, no per-thread -// object managed by Google Test will be leaked as long as all threads -// using Google Test have exited when main() returns. -template -class ThreadLocal { - public: - ThreadLocal() : key_(CreateKey()), - default_() {} - explicit ThreadLocal(const T& value) : key_(CreateKey()), - default_(value) {} - - ~ThreadLocal() { - // Destroys the managed object for the current thread, if any. - DeleteThreadLocalValue(pthread_getspecific(key_)); - - // Releases resources associated with the key. This will *not* - // delete managed objects for other threads. - GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); - } - - T* pointer() { return GetOrCreateValue(); } - const T* pointer() const { return GetOrCreateValue(); } - const T& get() const { return *pointer(); } - void set(const T& value) { *pointer() = value; } - - private: - // Holds a value of type T. - class ValueHolder : public ThreadLocalValueHolderBase { - public: - explicit ValueHolder(const T& value) : value_(value) {} - - T* pointer() { return &value_; } - - private: - T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); - }; - - static pthread_key_t CreateKey() { - pthread_key_t key; - // When a thread exits, DeleteThreadLocalValue() will be called on - // the object managed for that thread. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_key_create(&key, &DeleteThreadLocalValue)); - return key; - } - - T* GetOrCreateValue() const { - ThreadLocalValueHolderBase* const holder = - static_cast(pthread_getspecific(key_)); - if (holder != NULL) { - return CheckedDowncastToActualType(holder)->pointer(); - } - - ValueHolder* const new_holder = new ValueHolder(default_); - ThreadLocalValueHolderBase* const holder_base = new_holder; - GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); - return new_holder->pointer(); - } - - // A key pthreads uses for looking up per-thread values. - const pthread_key_t key_; - const T default_; // The default value for each thread. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); -}; - -# define GTEST_IS_THREADSAFE 1 - -#else // GTEST_HAS_PTHREAD - -// A dummy implementation of synchronization primitives (mutex, lock, -// and thread-local variable). Necessary for compiling Google Test where -// mutex is not supported - using Google Test in multiple threads is not -// supported on such platforms. 
- -class Mutex { - public: - Mutex() {} - void Lock() {} - void Unlock() {} - void AssertHeld() const {} -}; - -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex - -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex - -class GTestMutexLock { - public: - explicit GTestMutexLock(Mutex*) {} // NOLINT -}; - -typedef GTestMutexLock MutexLock; - -template -class ThreadLocal { - public: - ThreadLocal() : value_() {} - explicit ThreadLocal(const T& value) : value_(value) {} - T* pointer() { return &value_; } - const T* pointer() const { return &value_; } - const T& get() const { return value_; } - void set(const T& value) { value_ = value; } - private: - T value_; -}; - -// The above synchronization primitives have dummy implementations. -// Therefore Google Test is not thread-safe. -# define GTEST_IS_THREADSAFE 0 - -#endif // GTEST_HAS_PTHREAD - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -GTEST_API_ size_t GetThreadCount(); - -// Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio. The Nokia Symbian -// and the IBM XL C/C++ compiler try to instantiate a copy constructor -// for objects passed through ellipsis (...), failing for uncopyable -// objects. We define this to ensure that only POD is passed through -// ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_ELLIPSIS_NEEDS_POD_ 1 -#else -# define GTEST_CAN_COMPARE_NULL 1 -#endif - -// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between -// const T& and const T* in a function template. These compilers -// _can_ decide between class template specializations for T and T*, -// so a tr1::type_traits-like is_pointer works. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) -# define GTEST_NEEDS_IS_POINTER_ 1 -#endif - -template -struct bool_constant { - typedef bool_constant type; - static const bool value = bool_value; -}; -template const bool bool_constant::value; - -typedef bool_constant false_type; -typedef bool_constant true_type; - -template -struct is_pointer : public false_type {}; - -template -struct is_pointer : public true_type {}; - -template -struct IteratorTraits { - typedef typename Iterator::value_type value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 -// The biggest signed integer type the compiler supports. -typedef __int64 BiggestInt; -#else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 -typedef long long BiggestInt; // NOLINT -#endif // GTEST_OS_WINDOWS - -// Utilities for char. - -// isspace(int ch) and friends accept an unsigned char or EOF. char -// may be signed, depending on the compiler (or compiler flags). -// Therefore we need to cast a char to unsigned char before calling -// isspace(), etc. 
- -inline bool IsAlpha(char ch) { - return isalpha(static_cast(ch)) != 0; -} -inline bool IsAlNum(char ch) { - return isalnum(static_cast(ch)) != 0; -} -inline bool IsDigit(char ch) { - return isdigit(static_cast(ch)) != 0; -} -inline bool IsLower(char ch) { - return islower(static_cast(ch)) != 0; -} -inline bool IsSpace(char ch) { - return isspace(static_cast(ch)) != 0; -} -inline bool IsUpper(char ch) { - return isupper(static_cast(ch)) != 0; -} -inline bool IsXDigit(char ch) { - return isxdigit(static_cast(ch)) != 0; -} -inline bool IsXDigit(wchar_t ch) { - const unsigned char low_byte = static_cast(ch); - return ch == low_byte && isxdigit(low_byte) != 0; -} - -inline char ToLower(char ch) { - return static_cast(tolower(static_cast(ch))); -} -inline char ToUpper(char ch) { - return static_cast(toupper(static_cast(ch))); -} - -// The testing::internal::posix namespace holds wrappers for common -// POSIX functions. These wrappers hide the differences between -// Windows/MSVC and POSIX systems. Since some compilers define these -// standard functions as macros, the wrapper cannot have the same name -// as the wrapped function. - -namespace posix { - -// Functions with a different name on Windows. - -#if GTEST_OS_WINDOWS - -typedef struct _stat StatStruct; - -# ifdef __BORLANDC__ -inline int IsATTY(int fd) { return isatty(fd); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE -inline int IsATTY(int /* fd */) { return 0; } -# else -inline int IsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE -inline int StrCaseCmp(const char* s1, const char* s2) { - return _stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ - -# if GTEST_OS_WINDOWS_MOBILE -inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } -// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this -// time and thus not defined there. -# else -inline int FileNo(FILE* file) { return _fileno(file); } -inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } -inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE - -#else - -typedef struct stat StatStruct; - -inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } -inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return strcasecmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -inline int RmDir(const char* dir) { return rmdir(dir); } -inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } - -#endif // GTEST_OS_WINDOWS - -// Functions deprecated by MSVC 8.0. - -#ifdef _MSC_VER -// Temporarily disable warning 4996 (deprecated function). -# pragma warning(push) -# pragma warning(disable:4996) -#endif - -inline const char* StrNCpy(char* dest, const char* src, size_t n) { - return strncpy(dest, src, n); -} - -// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and -// StrError() aren't needed on Windows CE at this time and thus not -// defined there. 
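// Editor's sketch (illustrative, not part of the patch) of the wrapper pattern used
// in the posix namespace above: hide the MSVC/POSIX naming difference for a renamed
// or "deprecated" libc function behind one inline function, so callers never spell
// the platform-specific name.
#ifdef _WIN32
# include <string.h>    // _stricmp
#else
# include <strings.h>   // strcasecmp
#endif

namespace portable {
inline int StrCaseCmp(const char* lhs, const char* rhs) {
#ifdef _WIN32
  return _stricmp(lhs, rhs);
#else
  return strcasecmp(lhs, rhs);
#endif
}
}  // namespace portable
// portable::StrCaseCmp("VP9", "vp9") == 0 on both families of platforms.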
- -#if !GTEST_OS_WINDOWS_MOBILE -inline int ChDir(const char* dir) { return chdir(dir); } -#endif -inline FILE* FOpen(const char* path, const char* mode) { - return fopen(path, mode); -} -#if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { - return freopen(path, mode, stream); -} -inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } -#endif -inline int FClose(FILE* fp) { return fclose(fp); } -#if !GTEST_OS_WINDOWS_MOBILE -inline int Read(int fd, void* buf, unsigned int count) { - return static_cast(read(fd, buf, count)); -} -inline int Write(int fd, const void* buf, unsigned int count) { - return static_cast(write(fd, buf, count)); -} -inline int Close(int fd) { return close(fd); } -inline const char* StrError(int errnum) { return strerror(errnum); } -#endif -inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE - // We are on Windows CE, which has no environment variables. - return NULL; -#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) - // Environment variables which we programmatically clear will be set to the - // empty string rather than unset (NULL). Handle that case. - const char* const env = getenv(name); - return (env != NULL && env[0] != '\0') ? env : NULL; -#else - return getenv(name); -#endif -} - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif - -#if GTEST_OS_WINDOWS_MOBILE -// Windows CE has no C library. The abort() function is used in -// several places in Google Test. This implementation provides a reasonable -// imitation of standard behaviour. -void Abort(); -#else -inline void Abort() { abort(); } -#endif // GTEST_OS_WINDOWS_MOBILE - -} // namespace posix - -// MSVC "deprecates" snprintf and issues warnings wherever it is used. In -// order to avoid these warnings, we need to use _snprintf or _snprintf_s on -// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate -// function in order to achieve that. We use macro definition here because -// snprintf is a variadic function. -#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE -// MSVC 2005 and above support variadic macros. -# define GTEST_SNPRINTF_(buffer, size, format, ...) \ - _snprintf_s(buffer, size, size, format, __VA_ARGS__) -#elif defined(_MSC_VER) -// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't -// complain about _snprintf. -# define GTEST_SNPRINTF_ _snprintf -#else -# define GTEST_SNPRINTF_ snprintf -#endif - -// The maximum number a BiggestInt can represent. This definition -// works no matter BiggestInt is represented in one's complement or -// two's complement. -// -// We cannot rely on numeric_limits in STL, as __int64 and long long -// are not part of standard C++ and numeric_limits doesn't need to be -// defined for them. -const BiggestInt kMaxBiggestInt = - ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); - -// This template class serves as a compile-time function from size to -// type. It maps a size in bytes to a primitive type with that -// size. e.g. -// -// TypeWithSize<4>::UInt -// -// is typedef-ed to be unsigned int (unsigned integer made up of 4 -// bytes). -// -// Such functionality should belong to STL, but I cannot find it -// there. -// -// Google Test uses this class in the implementation of floating-point -// comparison. -// -// For now it only handles UInt (unsigned int) as that's all Google Test -// needs. Other types can be easily added in the future if need -// arises. 
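// Editor's note: the kMaxBiggestInt initializer above lost its template argument to
// angle-bracket stripping. Reconstructed from the surrounding comment, the expression
// has the shape below: set only the sign bit, then complement it, which yields the
// largest value the signed type can hold.
typedef long long BiggestIntSketch;   // stand-in for BiggestInt on non-Windows builds
const BiggestIntSketch kMaxBiggestIntSketch =
    ~(static_cast<BiggestIntSketch>(1) << (8 * sizeof(BiggestIntSketch) - 1));
// For a 64-bit type this evaluates to 0x7FFFFFFFFFFFFFFF.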
-template -class TypeWithSize { - public: - // This prevents the user from using TypeWithSize with incorrect - // values of N. - typedef void UInt; -}; - -// The specialization for size 4. -template <> -class TypeWithSize<4> { - public: - // unsigned int has size 4 in both gcc and MSVC. - // - // As base/basictypes.h doesn't compile on Windows, we cannot use - // uint32, uint64, and etc here. - typedef int Int; - typedef unsigned int UInt; -}; - -// The specialization for size 8. -template <> -class TypeWithSize<8> { - public: -#if GTEST_OS_WINDOWS - typedef __int64 Int; - typedef unsigned __int64 UInt; -#else - typedef long long Int; // NOLINT - typedef unsigned long long UInt; // NOLINT -#endif // GTEST_OS_WINDOWS -}; - -// Integer types of known sizes. -typedef TypeWithSize<4>::Int Int32; -typedef TypeWithSize<4>::UInt UInt32; -typedef TypeWithSize<8>::Int Int64; -typedef TypeWithSize<8>::UInt UInt64; -typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. - -// Utilities for command line flags and environment variables. - -// Macro for referencing flags. -#define GTEST_FLAG(name) FLAGS_gtest_##name - -// Macros for declaring flags. -#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -#define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) -#define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::std::string GTEST_FLAG(name) - -// Macros for defining flags. -#define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) - -// Thread annotations -#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -#define GTEST_LOCK_EXCLUDED_(locks) - -// Parses 'str' for a 32-bit signed integer. If successful, writes the result -// to *value and returns true; otherwise leaves *value unchanged and returns -// false. -// TODO(chandlerc): Find a better way to refactor flag and environment parsing -// out of both gtest-port.cc and gtest.cc to avoid exporting this utility -// function. -bool ParseInt32(const Message& src_text, const char* str, Int32* value); - -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. -bool BoolFromGTestEnv(const char* flag, bool default_val); -GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); -const char* StringFromGTestEnv(const char* flag, const char* default_val); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -#if GTEST_OS_LINUX -# include -# include -# include -# include -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#include -#include -#include -#include -#include -#include - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the Message class. -// -// IMPORTANT NOTE: Due to limitation of the C++ language, we have to -// leave some internal implementation details in this header file. -// They are clearly marked by comments like this: -// -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -// -// Such code is NOT meant to be used by a user directly, and is subject -// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user -// program! - -#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ - -#include - - -// Ensures that there is at least one operator<< in the global namespace. -// See Message& operator<<(...) below for why. -void operator<<(const testing::internal::Secret&, int); - -namespace testing { - -// The Message class works like an ostream repeater. -// -// Typical usage: -// -// 1. You stream a bunch of values to a Message object. -// It will remember the text in a stringstream. -// 2. Then you stream the Message object to an ostream. -// This causes the text in the Message to be streamed -// to the ostream. -// -// For example; -// -// testing::Message foo; -// foo << 1 << " != " << 2; -// std::cout << foo; -// -// will print "1 != 2". -// -// Message is not intended to be inherited from. In particular, its -// destructor is not virtual. -// -// Note that stringstream behaves differently in gcc and in MSVC. You -// can stream a NULL char pointer to it in the former, but not in the -// latter (it causes an access violation if you do). The Message -// class hides this difference by treating a NULL char pointer as -// "(null)". -class GTEST_API_ Message { - private: - // The type of basic IO manipulators (endl, ends, and flush) for - // narrow streams. - typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); - - public: - // Constructs an empty Message. - Message(); - - // Copy constructor. - Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT - *ss_ << msg.GetString(); - } - - // Constructs a Message from a C-string. - explicit Message(const char* str) : ss_(new ::std::stringstream) { - *ss_ << str; - } - -#if GTEST_OS_SYMBIAN - // Streams a value (either a pointer or not) to this object. 
- template - inline Message& operator <<(const T& value) { - StreamHelper(typename internal::is_pointer::type(), value); - return *this; - } -#else - // Streams a non-pointer value to this object. - template - inline Message& operator <<(const T& val) { - // Some libraries overload << for STL containers. These - // overloads are defined in the global namespace instead of ::std. - // - // C++'s symbol lookup rule (i.e. Koenig lookup) says that these - // overloads are visible in either the std namespace or the global - // namespace, but not other namespaces, including the testing - // namespace which Google Test's Message class is in. - // - // To allow STL containers (and other types that has a << operator - // defined in the global namespace) to be used in Google Test - // assertions, testing::Message must access the custom << operator - // from the global namespace. With this using declaration, - // overloads of << defined in the global namespace and those - // visible via Koenig lookup are both exposed in this function. - using ::operator <<; - *ss_ << val; - return *this; - } - - // Streams a pointer value to this object. - // - // This function is an overload of the previous one. When you - // stream a pointer to a Message, this definition will be used as it - // is more specialized. (The C++ Standard, section - // [temp.func.order].) If you stream a non-pointer, then the - // previous definition will be used. - // - // The reason for this overload is that streaming a NULL pointer to - // ostream is undefined behavior. Depending on the compiler, you - // may get "0", "(nil)", "(null)", or an access violation. To - // ensure consistent result across compilers, we always treat NULL - // as "(null)". - template - inline Message& operator <<(T* const& pointer) { // NOLINT - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - return *this; - } -#endif // GTEST_OS_SYMBIAN - - // Since the basic IO manipulators are overloaded for both narrow - // and wide streams, we have to provide this specialized definition - // of operator <<, even though its body is the same as the - // templatized version above. Without this definition, streaming - // endl or other basic IO manipulators to Message will confuse the - // compiler. - Message& operator <<(BasicNarrowIoManip val) { - *ss_ << val; - return *this; - } - - // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } - - // These two overloads allow streaming a wide C string to a Message - // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str); - Message& operator <<(wchar_t* wide_c_str); - -#if GTEST_HAS_STD_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::wstring& wstr); -#endif // GTEST_HAS_GLOBAL_WSTRING - - // Gets the text streamed to this object so far as an std::string. - // Each '\0' character in the buffer is replaced with "\\0". - // - // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
- std::string GetString() const; - - private: - -#if GTEST_OS_SYMBIAN - // These are needed as the Nokia Symbian Compiler cannot decide between - // const T& and const T* in a function template. The Nokia compiler _can_ - // decide between class template specializations for T and T*, so a - // tr1::type_traits-like is_pointer works, and we can overload on that. - template - inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - } - template - inline void StreamHelper(internal::false_type /*is_pointer*/, - const T& value) { - // See the comments in Message& operator <<(const T&) above for why - // we need this using statement. - using ::operator <<; - *ss_ << value; - } -#endif // GTEST_OS_SYMBIAN - - // We'll hold the text streamed to this object here. - const internal::scoped_ptr< ::std::stringstream> ss_; - - // We declare (but don't implement) this to prevent the compiler - // from implementing the assignment operator. - void operator=(const Message&); -}; - -// Streams a Message to an ostream. -inline std::ostream& operator <<(std::ostream& os, const Message& sb) { - return os << sb.GetString(); -} - -namespace internal { - -// Converts a streamable value to an std::string. A NULL pointer is -// converted to "(null)". When the input value is a ::string, -// ::std::string, ::wstring, or ::std::wstring object, each NUL -// character in it is replaced with "\\0". -template -std::string StreamableToString(const T& streamable) { - return (Message() << streamable).GetString(); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares the String class and functions used internally by -// Google Test. They are subject to change without notice. 
They should not used -// by code external to Google Test. -// -// This header file is #included by . -// It should not be #included by other files. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ - -#ifdef __BORLANDC__ -// string.h is not guaranteed to provide strcpy on C++ Builder. -# include -#endif - -#include -#include - - -namespace testing { -namespace internal { - -// String - an abstract class holding static string utilities. -class GTEST_API_ String { - public: - // Static utility methods - - // Clones a 0-terminated C string, allocating memory using new. The - // caller is responsible for deleting the return value using - // delete[]. Returns the cloned string, or NULL if the input is - // NULL. - // - // This is different from strdup() in string.h, which allocates - // memory using malloc(). - static const char* CloneCString(const char* c_str); - -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be - // able to pass strings to Win32 APIs on CE we need to convert them - // to 'Unicode', UTF-16. - - // Creates a UTF-16 wide string from the given ANSI string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the wide string, or NULL if the - // input is NULL. - // - // The wide string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static LPCWSTR AnsiToUtf16(const char* c_str); - - // Creates an ANSI string from the given wide string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the ANSI string, or NULL if the - // input is NULL. - // - // The returned string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static const char* Utf16ToAnsi(LPCWSTR utf16_str); -#endif - - // Compares two C strings. Returns true iff they have the same content. - // - // Unlike strcmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CStringEquals(const char* lhs, const char* rhs); - - // Converts a wide C string to a String using the UTF-8 encoding. - // NULL will be converted to "(null)". If an error occurred during - // the conversion, "(failed to convert from wide string)" is - // returned. - static std::string ShowWideCString(const wchar_t* wide_c_str); - - // Compares two wide C strings. Returns true iff they have the same - // content. - // - // Unlike wcscmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); - - // Compares two C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike strcasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). 
- // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. - static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs); - - // Returns true iff the given string ends with the given suffix, ignoring - // case. Any string is considered to end with an empty suffix. - static bool EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix); - - // Formats an int value as "%02d". - static std::string FormatIntWidth2(int value); // "%02d" for width == 2 - - // Formats an int value as "%X". - static std::string FormatHexInt(int value); - - // Formats a byte as "%02X". - static std::string FormatByte(unsigned char value); - - private: - String(); // Not meant to be instantiated. -}; // class String - -// Gets the content of the stringstream's buffer as an std::string. Each '\0' -// character in the buffer is replaced with "\\0". -GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: keith.ray@gmail.com (Keith Ray) -// -// Google Test filepath utilities -// -// This header file declares classes and functions used internally by -// Google Test. They are subject to change without notice. -// -// This file is #included in . -// Do not include this header file separately! 
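// Editor's sketch (illustrative, not gtest's implementation) of the behaviour
// documented for StringStreamToString() above: the stringstream buffer is returned
// as a std::string with every embedded NUL byte rendered as "\\0", so binary data
// stays printable in failure messages.
#include <sstream>
#include <string>

std::string ToPrintableString(const std::stringstream& ss) {
  const std::string raw = ss.str();
  std::string out;
  for (std::string::size_type i = 0; i < raw.size(); ++i) {
    if (raw[i] == '\0')
      out += "\\0";
    else
      out += raw[i];
  }
  return out;
}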
- -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ - - -namespace testing { -namespace internal { - -// FilePath - a class for file and directory pathname manipulation which -// handles platform-specific conventions (like the pathname separator). -// Used for helper functions for naming files in a directory for xml output. -// Except for Set methods, all methods are const or static, which provides an -// "immutable value object" -- useful for peace of mind. -// A FilePath with a value ending in a path separator ("like/this/") represents -// a directory, otherwise it is assumed to represent a file. In either case, -// it may or may not represent an actual file or directory in the file system. -// Names are NOT checked for syntax correctness -- no checking for illegal -// characters, malformed paths, etc. - -class GTEST_API_ FilePath { - public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } - - explicit FilePath(const std::string& pathname) : pathname_(pathname) { - Normalize(); - } - - FilePath& operator=(const FilePath& rhs) { - Set(rhs); - return *this; - } - - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } - - const std::string& string() const { return pathname_; } - const char* c_str() const { return pathname_.c_str(); } - - // Returns the current working directory, or "" if unsuccessful. - static FilePath GetCurrentDir(); - - // Given directory = "dir", base_name = "test", number = 0, - // extension = "xml", returns "dir/test.xml". If number is greater - // than zero (e.g., 12), returns "dir/test_12.xml". - // On Windows platform, uses \ as the separator rather than /. - static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension); - - // Given directory = "dir", relative_path = "test.xml", - // returns "dir/test.xml". - // On Windows, uses \ as the separator rather than /. - static FilePath ConcatPaths(const FilePath& directory, - const FilePath& relative_path); - - // Returns a pathname for a file that does not currently exist. The pathname - // will be directory/base_name.extension or - // directory/base_name_.extension if directory/base_name.extension - // already exists. The number will be incremented until a pathname is found - // that does not already exist. - // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. - // There could be a race condition if two or more processes are calling this - // function at the same time -- they could both pick the same filename. - static FilePath GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension); - - // Returns true iff the path is "". - bool IsEmpty() const { return pathname_.empty(); } - - // If input name has a trailing separator character, removes it and returns - // the name, otherwise return the name string unmodified. - // On Windows platform, uses \ as the separator, other platforms use /. - FilePath RemoveTrailingPathSeparator() const; - - // Returns a copy of the FilePath with the directory part removed. - // Example: FilePath("path/to/file").RemoveDirectoryName() returns - // FilePath("file"). If there is no directory part ("just_a_file"), it returns - // the FilePath unmodified. If there is no file part ("just_a_dir/") it - // returns an empty FilePath (""). - // On Windows platform, '\' is the path separator, otherwise it is '/'. 
- FilePath RemoveDirectoryName() const; - - // RemoveFileName returns the directory path with the filename removed. - // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". - // If the FilePath is "a_file" or "/a_file", RemoveFileName returns - // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does - // not have a file, like "just/a/dir/", it returns the FilePath unmodified. - // On Windows platform, '\' is the path separator, otherwise it is '/'. - FilePath RemoveFileName() const; - - // Returns a copy of the FilePath with the case-insensitive extension removed. - // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns - // FilePath("dir/file"). If a case-insensitive extension is not - // found, returns a copy of the original FilePath. - FilePath RemoveExtension(const char* extension) const; - - // Creates directories so that path exists. Returns true if successful or if - // the directories already exist; returns false if unable to create - // directories for any reason. Will also return false if the FilePath does - // not represent a directory (that is, it doesn't end with a path separator). - bool CreateDirectoriesRecursively() const; - - // Create the directory so that path exists. Returns true if successful or - // if the directory already exists; returns false if unable to create the - // directory for any reason, including if the parent directory does not - // exist. Not named "CreateDirectory" because that's a macro on Windows. - bool CreateFolder() const; - - // Returns true if FilePath describes something in the file-system, - // either a file, directory, or whatever, and that something exists. - bool FileOrDirectoryExists() const; - - // Returns true if pathname describes a directory in the file-system - // that exists. - bool DirectoryExists() const; - - // Returns true if FilePath ends with a path separator, which indicates that - // it is intended to represent a directory. Returns false otherwise. - // This does NOT check that a directory (or file) actually exists. - bool IsDirectory() const; - - // Returns true if pathname describes a root directory. (Windows has one - // root directory per disk drive.) - bool IsRootDirectory() const; - - // Returns true if pathname describes an absolute path. - bool IsAbsolutePath() const; - - private: - // Replaces multiple consecutive separators with a single separator. - // For example, "bar///foo" becomes "bar/foo". Does not eliminate other - // redundancies that might be in a pathname involving "." or "..". - // - // A pathname with multiple consecutive separators may occur either through - // user error or as a result of some scripts or APIs that generate a pathname - // with a trailing separator. On other platforms the same API or script - // may NOT generate a pathname with a trailing "/". Then elsewhere that - // pathname may have another "/" and pathname components added to it, - // without checking for the separator already being there. - // The script language and operating system may allow paths like "foo//bar" - // but some of the functions in FilePath will not handle that correctly. In - // particular, RemoveTrailingPathSeparator() only removes one separator, and - // it is called in CreateDirectoriesRecursively() assuming that it will change - // a pathname from directory syntax (trailing separator) to filename syntax. 
- // - // On Windows this method also replaces the alternate path separator '/' with - // the primary path separator '\\', so that for example "bar\\/\\foo" becomes - // "bar\\foo". - - void Normalize(); - - // Returns a pointer to the last occurence of a valid path separator in - // the FilePath. On Windows, for example, both '/' and '\' are valid path - // separators. Returns NULL if no path separator was found. - const char* FindLastPathSeparator() const; - - std::string pathname_; -}; // class FilePath - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -// This file was GENERATED by command: -// pump.py gtest-type-util.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Type utilities needed for implementing typed and type-parameterized -// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently we support at most 50 types in a list, and at most 50 -// type-parameterized tests in one type-parameterized test case. -// Please contact googletestframework@googlegroups.com if you need -// more. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - - -// #ifdef __GNUC__ is too general here. It is possible to use gcc without using -// libstdc++ (which is where cxxabi.h comes from). -# if GTEST_HAS_CXXABI_H_ -# include -# elif defined(__HP_aCC) -# include -# endif // GTEST_HASH_CXXABI_H_ - -namespace testing { -namespace internal { - -// GetTypeName() returns a human-readable name of type T. -// NB: This function is also used in Google Mock, so don't move it inside of -// the typed-test-only section below. 
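// Editor's sketch of the RTTI + demangling technique that GetTypeName(), defined
// just below, is documented as using. Assumes a GCC/Clang toolchain where
// <cxxabi.h> is available; the real function falls back to the raw typeid name,
// or to an empty string when RTTI is disabled.
#include <cxxabi.h>
#include <cstdlib>
#include <string>
#include <typeinfo>

template <typename T>
std::string DemangledNameSketch() {
  int status = 0;
  char* readable = abi::__cxa_demangle(typeid(T).name(), 0, 0, &status);
  const std::string result(status == 0 && readable != NULL ? readable
                                                            : typeid(T).name());
  std::free(readable);   // free(NULL) is a no-op
  return result;         // e.g. "i" becomes "int"
}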
-template -std::string GetTypeName() { -# if GTEST_HAS_RTTI - - const char* const name = typeid(T).name(); -# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) - int status = 0; - // gcc's implementation of typeid(T).name() mangles the type name, - // so we have to demangle it. -# if GTEST_HAS_CXXABI_H_ - using abi::__cxa_demangle; -# endif // GTEST_HAS_CXXABI_H_ - char* const readable_name = __cxa_demangle(name, 0, 0, &status); - const std::string name_str(status == 0 ? readable_name : name); - free(readable_name); - return name_str; -# else - return name; -# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC - -# else - - return ""; - -# endif // GTEST_HAS_RTTI -} - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// AssertyTypeEq::type is defined iff T1 and T2 are the same -// type. This can be used as a compile-time assertion to ensure that -// two types are equal. - -template -struct AssertTypeEq; - -template -struct AssertTypeEq { - typedef bool type; -}; - -// A unique type used as the default value for the arguments of class -// template Types. This allows us to simulate variadic templates -// (e.g. Types, Type, and etc), which C++ doesn't -// support directly. -struct None {}; - -// The following family of struct and struct templates are used to -// represent type lists. In particular, TypesN -// represents a type list with N types (T1, T2, ..., and TN) in it. -// Except for Types0, every struct in the family has two member types: -// Head for the first type in the list, and Tail for the rest of the -// list. - -// The empty type list. -struct Types0 {}; - -// Type lists of length 1, 2, 3, and so on. - -template -struct Types1 { - typedef T1 Head; - typedef Types0 Tail; -}; -template -struct Types2 { - typedef T1 Head; - typedef Types1 Tail; -}; - -template -struct Types3 { - typedef T1 Head; - typedef Types2 Tail; -}; - -template -struct Types4 { - typedef T1 Head; - typedef Types3 Tail; -}; - -template -struct Types5 { - typedef T1 Head; - typedef Types4 Tail; -}; - -template -struct Types6 { - typedef T1 Head; - typedef Types5 Tail; -}; - -template -struct Types7 { - typedef T1 Head; - typedef Types6 Tail; -}; - -template -struct Types8 { - typedef T1 Head; - typedef Types7 Tail; -}; - -template -struct Types9 { - typedef T1 Head; - typedef Types8 Tail; -}; - -template -struct Types10 { - typedef T1 Head; - typedef Types9 Tail; -}; - -template -struct Types11 { - typedef T1 Head; - typedef Types10 Tail; -}; - -template -struct Types12 { - typedef T1 Head; - typedef Types11 Tail; -}; - -template -struct Types13 { - typedef T1 Head; - typedef Types12 Tail; -}; - -template -struct Types14 { - typedef T1 Head; - typedef Types13 Tail; -}; - -template -struct Types15 { - typedef T1 Head; - typedef Types14 Tail; -}; - -template -struct Types16 { - typedef T1 Head; - typedef Types15 Tail; -}; - -template -struct Types17 { - typedef T1 Head; - typedef Types16 Tail; -}; - -template -struct Types18 { - typedef T1 Head; - typedef Types17 Tail; -}; - -template -struct Types19 { - typedef T1 Head; - typedef Types18 Tail; -}; - -template -struct Types20 { - typedef T1 Head; - typedef Types19 Tail; -}; - -template -struct Types21 { - typedef T1 Head; - typedef Types20 Tail; -}; - -template -struct Types22 { - typedef T1 Head; - typedef Types21 Tail; -}; - -template -struct Types23 { - typedef T1 Head; - typedef Types22 Tail; -}; - -template -struct Types24 { - typedef T1 Head; - typedef Types23 Tail; -}; - -template -struct Types25 { - typedef T1 Head; - typedef Types24 Tail; -}; - -template -struct 
Types26 { - typedef T1 Head; - typedef Types25 Tail; -}; - -template -struct Types27 { - typedef T1 Head; - typedef Types26 Tail; -}; - -template -struct Types28 { - typedef T1 Head; - typedef Types27 Tail; -}; - -template -struct Types29 { - typedef T1 Head; - typedef Types28 Tail; -}; - -template -struct Types30 { - typedef T1 Head; - typedef Types29 Tail; -}; - -template -struct Types31 { - typedef T1 Head; - typedef Types30 Tail; -}; - -template -struct Types32 { - typedef T1 Head; - typedef Types31 Tail; -}; - -template -struct Types33 { - typedef T1 Head; - typedef Types32 Tail; -}; - -template -struct Types34 { - typedef T1 Head; - typedef Types33 Tail; -}; - -template -struct Types35 { - typedef T1 Head; - typedef Types34 Tail; -}; - -template -struct Types36 { - typedef T1 Head; - typedef Types35 Tail; -}; - -template -struct Types37 { - typedef T1 Head; - typedef Types36 Tail; -}; - -template -struct Types38 { - typedef T1 Head; - typedef Types37 Tail; -}; - -template -struct Types39 { - typedef T1 Head; - typedef Types38 Tail; -}; - -template -struct Types40 { - typedef T1 Head; - typedef Types39 Tail; -}; - -template -struct Types41 { - typedef T1 Head; - typedef Types40 Tail; -}; - -template -struct Types42 { - typedef T1 Head; - typedef Types41 Tail; -}; - -template -struct Types43 { - typedef T1 Head; - typedef Types42 Tail; -}; - -template -struct Types44 { - typedef T1 Head; - typedef Types43 Tail; -}; - -template -struct Types45 { - typedef T1 Head; - typedef Types44 Tail; -}; - -template -struct Types46 { - typedef T1 Head; - typedef Types45 Tail; -}; - -template -struct Types47 { - typedef T1 Head; - typedef Types46 Tail; -}; - -template -struct Types48 { - typedef T1 Head; - typedef Types47 Tail; -}; - -template -struct Types49 { - typedef T1 Head; - typedef Types48 Tail; -}; - -template -struct Types50 { - typedef T1 Head; - typedef Types49 Tail; -}; - - -} // namespace internal - -// We don't want to require the users to write TypesN<...> directly, -// as that would require them to count the length. Types<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Types -// will appear as Types in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Types, and Google Test will translate -// that to TypesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Types template. 
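The Head/Tail encoding described above can be hard to see through fifty generated specializations; a minimal hand-written sketch of the same idea (names invented, lists of length at most two) looks like this:

// The empty list terminates the compile-time recursion.
struct MyTypes0 {};

template <typename T1>
struct MyTypes1 {
  typedef T1 Head;
  typedef MyTypes0 Tail;
};

template <typename T1, typename T2>
struct MyTypes2 {
  typedef T1 Head;
  typedef MyTypes1<T2> Tail;
};

// Compile-time traversal: peel off Head, recurse into Tail.
typedef MyTypes2<int, char> List;
typedef List::Head First;         // int
typedef List::Tail::Head Second;  // char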
-template -struct Types { - typedef internal::Types50 type; -}; - -template <> -struct Types { - typedef internal::Types0 type; -}; -template -struct Types { - typedef internal::Types1 type; -}; -template -struct Types { - typedef internal::Types2 type; -}; -template -struct Types { - typedef internal::Types3 type; -}; -template -struct Types { - typedef internal::Types4 type; -}; -template -struct Types { - typedef internal::Types5 type; -}; -template -struct Types { - typedef internal::Types6 type; -}; -template -struct Types { - typedef internal::Types7 type; -}; -template -struct Types { - typedef internal::Types8 type; -}; -template -struct Types { - typedef internal::Types9 type; -}; -template -struct Types { - typedef internal::Types10 type; -}; -template -struct Types { - typedef internal::Types11 type; -}; -template -struct Types { - typedef internal::Types12 type; -}; -template -struct Types { - typedef internal::Types13 type; -}; -template -struct Types { - typedef internal::Types14 type; -}; -template -struct Types { - typedef internal::Types15 type; -}; -template -struct Types { - typedef internal::Types16 type; -}; -template -struct Types { - typedef internal::Types17 type; -}; -template -struct Types { - typedef internal::Types18 type; -}; -template -struct Types { - typedef internal::Types19 type; -}; -template -struct Types { - typedef internal::Types20 type; -}; -template -struct Types { - typedef internal::Types21 type; -}; -template -struct Types { - typedef internal::Types22 type; -}; -template -struct Types { - typedef internal::Types23 type; -}; -template -struct Types { - typedef internal::Types24 type; -}; -template -struct Types { - typedef internal::Types25 type; -}; -template -struct Types { - typedef internal::Types26 type; -}; -template -struct Types { - typedef internal::Types27 type; -}; -template -struct Types { - typedef internal::Types28 type; -}; -template -struct Types { - typedef internal::Types29 type; -}; -template -struct Types { - typedef internal::Types30 type; -}; -template -struct Types { - typedef internal::Types31 type; -}; -template -struct Types { - typedef internal::Types32 type; -}; -template -struct Types { - typedef internal::Types33 type; -}; -template -struct Types { - typedef internal::Types34 type; -}; -template -struct Types { - typedef internal::Types35 type; -}; -template -struct Types { - typedef internal::Types36 type; -}; -template -struct Types { - typedef internal::Types37 type; -}; -template -struct Types { - typedef internal::Types38 type; -}; -template -struct Types { - typedef internal::Types39 type; -}; -template -struct Types { - typedef internal::Types40 type; -}; -template -struct Types { - typedef internal::Types41 type; -}; -template -struct Types { - typedef internal::Types42 type; -}; -template -struct Types { - typedef internal::Types43 type; -}; -template -struct Types { - typedef internal::Types44 type; -}; -template -struct Types { - typedef internal::Types45 type; -}; -template -struct Types { - typedef internal::Types46 type; -}; -template -struct Types { - typedef internal::Types47 type; -}; -template -struct Types { - typedef internal::Types48 type; -}; -template -struct Types { - typedef internal::Types49 type; -}; - -namespace internal { - -# define GTEST_TEMPLATE_ template class - -// The template "selector" struct TemplateSel is used to -// represent Tmpl, which must be a class template with one type -// parameter, as a type. TemplateSel::Bind::type is defined -// as the type Tmpl. 
This allows us to actually instantiate the -// template "selected" by TemplateSel. -// -// This trick is necessary for simulating typedef for class templates, -// which C++ doesn't support directly. -template -struct TemplateSel { - template - struct Bind { - typedef Tmpl type; - }; -}; - -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind::type - -// A unique struct template used as the default value for the -// arguments of class template Templates. This allows us to simulate -// variadic templates (e.g. Templates, Templates, -// and etc), which C++ doesn't support directly. -template -struct NoneT {}; - -// The following family of struct and struct templates are used to -// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except -// for Templates0, every struct in the family has two member types: -// Head for the selector of the first template in the list, and Tail -// for the rest of the list. - -// The empty template list. -struct Templates0 {}; - -// Template lists of length 1, 2, 3, and so on. - -template -struct Templates1 { - typedef TemplateSel Head; - typedef Templates0 Tail; -}; -template -struct Templates2 { - typedef TemplateSel Head; - typedef Templates1 Tail; -}; - -template -struct Templates3 { - typedef TemplateSel Head; - typedef Templates2 Tail; -}; - -template -struct Templates4 { - typedef TemplateSel Head; - typedef Templates3 Tail; -}; - -template -struct Templates5 { - typedef TemplateSel Head; - typedef Templates4 Tail; -}; - -template -struct Templates6 { - typedef TemplateSel Head; - typedef Templates5 Tail; -}; - -template -struct Templates7 { - typedef TemplateSel Head; - typedef Templates6 Tail; -}; - -template -struct Templates8 { - typedef TemplateSel Head; - typedef Templates7 Tail; -}; - -template -struct Templates9 { - typedef TemplateSel Head; - typedef Templates8 Tail; -}; - -template -struct Templates10 { - typedef TemplateSel Head; - typedef Templates9 Tail; -}; - -template -struct Templates11 { - typedef TemplateSel Head; - typedef Templates10 Tail; -}; - -template -struct Templates12 { - typedef TemplateSel Head; - typedef Templates11 Tail; -}; - -template -struct Templates13 { - typedef TemplateSel Head; - typedef Templates12 Tail; -}; - -template -struct Templates14 { - typedef TemplateSel Head; - typedef Templates13 Tail; -}; - -template -struct Templates15 { - typedef TemplateSel Head; - typedef Templates14 Tail; -}; - -template -struct Templates16 { - typedef TemplateSel Head; - typedef Templates15 Tail; -}; - -template -struct Templates17 { - typedef TemplateSel Head; - typedef Templates16 Tail; -}; - -template -struct Templates18 { - typedef TemplateSel Head; - typedef Templates17 Tail; -}; - -template -struct Templates19 { - typedef TemplateSel Head; - typedef Templates18 Tail; -}; - -template -struct Templates20 { - typedef TemplateSel Head; - typedef Templates19 Tail; -}; - -template -struct Templates21 { - typedef TemplateSel Head; - typedef Templates20 Tail; -}; - -template -struct Templates22 { - typedef TemplateSel Head; - typedef Templates21 Tail; -}; - -template -struct Templates23 { - typedef TemplateSel Head; - typedef Templates22 Tail; -}; - -template -struct Templates24 { - typedef TemplateSel Head; - typedef Templates23 Tail; -}; - -template -struct Templates25 { - typedef TemplateSel Head; - typedef Templates24 Tail; -}; - -template -struct Templates26 { - typedef TemplateSel Head; - typedef Templates25 Tail; -}; - -template -struct Templates27 { - typedef 
TemplateSel Head; - typedef Templates26 Tail; -}; - -template -struct Templates28 { - typedef TemplateSel Head; - typedef Templates27 Tail; -}; - -template -struct Templates29 { - typedef TemplateSel Head; - typedef Templates28 Tail; -}; - -template -struct Templates30 { - typedef TemplateSel Head; - typedef Templates29 Tail; -}; - -template -struct Templates31 { - typedef TemplateSel Head; - typedef Templates30 Tail; -}; - -template -struct Templates32 { - typedef TemplateSel Head; - typedef Templates31 Tail; -}; - -template -struct Templates33 { - typedef TemplateSel Head; - typedef Templates32 Tail; -}; - -template -struct Templates34 { - typedef TemplateSel Head; - typedef Templates33 Tail; -}; - -template -struct Templates35 { - typedef TemplateSel Head; - typedef Templates34 Tail; -}; - -template -struct Templates36 { - typedef TemplateSel Head; - typedef Templates35 Tail; -}; - -template -struct Templates37 { - typedef TemplateSel Head; - typedef Templates36 Tail; -}; - -template -struct Templates38 { - typedef TemplateSel Head; - typedef Templates37 Tail; -}; - -template -struct Templates39 { - typedef TemplateSel Head; - typedef Templates38 Tail; -}; - -template -struct Templates40 { - typedef TemplateSel Head; - typedef Templates39 Tail; -}; - -template -struct Templates41 { - typedef TemplateSel Head; - typedef Templates40 Tail; -}; - -template -struct Templates42 { - typedef TemplateSel Head; - typedef Templates41 Tail; -}; - -template -struct Templates43 { - typedef TemplateSel Head; - typedef Templates42 Tail; -}; - -template -struct Templates44 { - typedef TemplateSel Head; - typedef Templates43 Tail; -}; - -template -struct Templates45 { - typedef TemplateSel Head; - typedef Templates44 Tail; -}; - -template -struct Templates46 { - typedef TemplateSel Head; - typedef Templates45 Tail; -}; - -template -struct Templates47 { - typedef TemplateSel Head; - typedef Templates46 Tail; -}; - -template -struct Templates48 { - typedef TemplateSel Head; - typedef Templates47 Tail; -}; - -template -struct Templates49 { - typedef TemplateSel Head; - typedef Templates48 Tail; -}; - -template -struct Templates50 { - typedef TemplateSel Head; - typedef Templates49 Tail; -}; - - -// We don't want to require the users to write TemplatesN<...> directly, -// as that would require them to count the length. Templates<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Templates -// will appear as Templates in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Templates, and Google Test will translate -// that to TemplatesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Templates template. 
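The TemplateSel/Bind trick referred to above (a "typedef for a class template") can likewise be condensed into a small standalone sketch; MyFixture is a hypothetical stand-in for a test template:

// Wraps a one-parameter class template so it can be passed around as an
// ordinary type and re-applied to a concrete type later.
template <template <typename T> class Tmpl>
struct TemplateSelSketch {
  template <typename T>
  struct Bind {
    typedef Tmpl<T> type;
  };
};

template <typename T>
struct MyFixture {};  // hypothetical class template

typedef TemplateSelSketch<MyFixture> Selector;   // the template, as a type
typedef Selector::Bind<int>::type FixtureOfInt;  // i.e. MyFixture<int>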
-template -struct Templates { - typedef Templates50 type; -}; - -template <> -struct Templates { - typedef Templates0 type; -}; -template -struct Templates { - typedef Templates1 type; -}; -template -struct Templates { - typedef Templates2 type; -}; -template -struct Templates { - typedef Templates3 type; -}; -template -struct Templates { - typedef Templates4 type; -}; -template -struct Templates { - typedef Templates5 type; -}; -template -struct Templates { - typedef Templates6 type; -}; -template -struct Templates { - typedef Templates7 type; -}; -template -struct Templates { - typedef Templates8 type; -}; -template -struct Templates { - typedef Templates9 type; -}; -template -struct Templates { - typedef Templates10 type; -}; -template -struct Templates { - typedef Templates11 type; -}; -template -struct Templates { - typedef Templates12 type; -}; -template -struct Templates { - typedef Templates13 type; -}; -template -struct Templates { - typedef Templates14 type; -}; -template -struct Templates { - typedef Templates15 type; -}; -template -struct Templates { - typedef Templates16 type; -}; -template -struct Templates { - typedef Templates17 type; -}; -template -struct Templates { - typedef Templates18 type; -}; -template -struct Templates { - typedef Templates19 type; -}; -template -struct Templates { - typedef Templates20 type; -}; -template -struct Templates { - typedef Templates21 type; -}; -template -struct Templates { - typedef Templates22 type; -}; -template -struct Templates { - typedef Templates23 type; -}; -template -struct Templates { - typedef Templates24 type; -}; -template -struct Templates { - typedef Templates25 type; -}; -template -struct Templates { - typedef Templates26 type; -}; -template -struct Templates { - typedef Templates27 type; -}; -template -struct Templates { - typedef Templates28 type; -}; -template -struct Templates { - typedef Templates29 type; -}; -template -struct Templates { - typedef Templates30 type; -}; -template -struct Templates { - typedef Templates31 type; -}; -template -struct Templates { - typedef Templates32 type; -}; -template -struct Templates { - typedef Templates33 type; -}; -template -struct Templates { - typedef Templates34 type; -}; -template -struct Templates { - typedef Templates35 type; -}; -template -struct Templates { - typedef Templates36 type; -}; -template -struct Templates { - typedef Templates37 type; -}; -template -struct Templates { - typedef Templates38 type; -}; -template -struct Templates { - typedef Templates39 type; -}; -template -struct Templates { - typedef Templates40 type; -}; -template -struct Templates { - typedef Templates41 type; -}; -template -struct Templates { - typedef Templates42 type; -}; -template -struct Templates { - typedef Templates43 type; -}; -template -struct Templates { - typedef Templates44 type; -}; -template -struct Templates { - typedef Templates45 type; -}; -template -struct Templates { - typedef Templates46 type; -}; -template -struct Templates { - typedef Templates47 type; -}; -template -struct Templates { - typedef Templates48 type; -}; -template -struct Templates { - typedef Templates49 type; -}; - -// The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_CASE() and -// INSTANTIATE_TYPED_TEST_CASE_P(). 
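At the user level this machinery surfaces through the typed-test macros of this gtest generation; an illustrative (not libvpx-specific) usage is:

#include <vector>
#include "gtest/gtest.h"

template <typename T>
class VectorTest : public ::testing::Test {};

// TYPED_TEST_CASE accepts either a single type or a ::testing::Types<...>
// list, which is exactly what the TypeList template below makes possible.
typedef ::testing::Types<int, long, double> MyTypes;
TYPED_TEST_CASE(VectorTest, MyTypes);

TYPED_TEST(VectorTest, StartsEmpty) {
  std::vector<TypeParam> v;  // TypeParam is the type currently under test
  EXPECT_TRUE(v.empty());
}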
- -template -struct TypeList { - typedef Types1 type; -}; - -template -struct TypeList > { - typedef typename Types::type type; -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - -// Due to C++ preprocessor weirdness, we need double indirection to -// concatenate two tokens when one of them is __LINE__. Writing -// -// foo ## __LINE__ -// -// will result in the token foo__LINE__, instead of foo followed by -// the current line number. For more details, see -// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 -#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar - -class ProtocolMessage; -namespace proto2 { class Message; } - -namespace testing { - -// Forward declarations. - -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test cases. - -template -::std::string PrintToString(const T& value); - -namespace internal { - -struct TraceInfo; // Information about a trace point. -class ScopedTrace; // Implements scoped trace. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest - -// How many times InitGoogleTest() has been called. -GTEST_API_ extern int g_init_gtest_count; - -// The text used in failure messages to indicate the start of the -// stack trace. -GTEST_API_ extern const char kStackTraceMarker[]; - -// Two overloaded helpers for checking at compile time whether an -// expression is a null pointer literal (i.e. NULL or any 0-valued -// compile-time integral constant). Their return values have -// different sizes, so we can use sizeof() to test which version is -// picked by the compiler. These helpers have no implementations, as -// we only need their signatures. -// -// Given IsNullLiteralHelper(x), the compiler will pick the first -// version if x can be implicitly converted to Secret*, and pick the -// second version otherwise. Since Secret is a secret and incomplete -// type, the only expression a user can write that has type Secret* is -// a null pointer literal. Therefore, we know that x is a null -// pointer literal if and only if the first version is picked by the -// compiler. -char IsNullLiteralHelper(Secret* p); -char (&IsNullLiteralHelper(...))[2]; // NOLINT - -// A compile-time bool constant that is true if and only if x is a -// null pointer literal (i.e. NULL or any 0-valued compile-time -// integral constant). -#ifdef GTEST_ELLIPSIS_NEEDS_POD_ -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_IS_NULL_LITERAL_(x) false -#else -# define GTEST_IS_NULL_LITERAL_(x) \ - (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) -#endif // GTEST_ELLIPSIS_NEEDS_POD_ - -// Appends the user-supplied message to the Google-Test-generated message. -GTEST_API_ std::string AppendUserMessage( - const std::string& gtest_msg, const Message& user_msg); - -#if GTEST_HAS_EXCEPTIONS - -// This exception is thrown by (and only by) a failed Google Test -// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions -// are enabled). 
We derive it from std::runtime_error, which is for -// errors presumably detectable only at run time. Since -// std::runtime_error inherits from std::exception, many testing -// frameworks know how to extract and print the message inside it. -class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { - public: - explicit GoogleTestFailureException(const TestPartResult& failure); -}; - -#endif // GTEST_HAS_EXCEPTIONS - -// A helper class for creating scoped traces in user programs. -class GTEST_API_ ScopedTrace { - public: - // The c'tor pushes the given source file location and message onto - // a trace stack maintained by Google Test. - ScopedTrace(const char* file, int line, const Message& message); - - // The d'tor pops the info pushed by the c'tor. - // - // Note that the d'tor is not virtual in order to be efficient. - // Don't inherit from ScopedTrace! - ~ScopedTrace(); - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); -} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its - // c'tor and d'tor. Therefore it doesn't - // need to be used otherwise. - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -GTEST_API_ AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const std::string& expected_value, - const std::string& actual_value, - bool ignoring_case); - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -GTEST_API_ std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); - -// This template class represents an IEEE floating-point number -// (either single-precision or double-precision, depending on the -// template parameters). -// -// The purpose of this class is to do more sophisticated number -// comparison. (Due to round-off error, etc, it's very unlikely that -// two floating-points will be equal exactly. Hence a naive -// comparison by the == operation often doesn't work.) -// -// Format of IEEE floating-point: -// -// The most-significant bit being the leftmost, an IEEE -// floating-point looks like -// -// sign_bit exponent_bits fraction_bits -// -// Here, sign_bit is a single bit that designates the sign of the -// number. -// -// For float, there are 8 exponent bits and 23 fraction bits. -// -// For double, there are 11 exponent bits and 52 fraction bits. -// -// More details can be found at -// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. -// -// Template parameter: -// -// RawType: the raw floating-point type (either float or double) -template -class FloatingPoint { - public: - // Defines the unsigned integer type that has the same size as the - // floating point number. - typedef typename TypeWithSize::UInt Bits; - - // Constants. - - // # of bits in a number. - static const size_t kBitCount = 8*sizeof(RawType); - - // # of fraction bits in a number. 
- static const size_t kFractionBitCount = - std::numeric_limits::digits - 1; - - // # of exponent bits in a number. - static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; - - // The mask for the sign bit. - static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); - - // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast(0) >> (kExponentBitCount + 1); - - // The mask for the exponent bits. - static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); - - // How many ULP's (Units in the Last Place) we want to tolerate when - // comparing two numbers. The larger the value, the more error we - // allow. A 0 value means that two numbers must be exactly the same - // to be considered equal. - // - // The maximum error of a single floating-point operation is 0.5 - // units in the last place. On Intel CPU's, all floating-point - // calculations are done with 80-bit precision, while double has 64 - // bits. Therefore, 4 should be enough for ordinary use. - // - // See the following article for more details on ULP: - // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ - static const size_t kMaxUlps = 4; - - // Constructs a FloatingPoint from a raw floating-point number. - // - // On an Intel CPU, passing a non-normalized NAN (Not a Number) - // around may change its bits, although the new value is guaranteed - // to be also a NAN. Therefore, don't expect this constructor to - // preserve the bits in x when x is a NAN. - explicit FloatingPoint(const RawType& x) { u_.value_ = x; } - - // Static methods - - // Reinterprets a bit pattern as a floating-point number. - // - // This function is needed to test the AlmostEquals() method. - static RawType ReinterpretBits(const Bits bits) { - FloatingPoint fp(0); - fp.u_.bits_ = bits; - return fp.u_.value_; - } - - // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } - - // Returns the maximum representable finite floating-point number. - static RawType Max(); - - // Non-static methods - - // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } - - // Returns the exponent bits of this number. - Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } - - // Returns the fraction bits of this number. - Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } - - // Returns the sign bit of this number. - Bits sign_bit() const { return kSignBitMask & u_.bits_; } - - // Returns true iff this is NAN (not a number). - bool is_nan() const { - // It's a NAN if the exponent bits are all ones and the fraction - // bits are not entirely zeros. - return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); - } - - // Returns true iff this number is at most kMaxUlps ULP's away from - // rhs. In particular, this function: - // - // - returns false if either number is (or both are) NAN. - // - treats really large numbers as almost equal to infinity. - // - thinks +0.0 and -0.0 are 0 DLP's apart. - bool AlmostEquals(const FloatingPoint& rhs) const { - // The IEEE standard says that any comparison operation involving - // a NAN must return false. - if (is_nan() || rhs.is_nan()) return false; - - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; - } - - private: - // The data type used to store the actual floating-point number. 
- union FloatingPointUnion { - RawType value_; // The raw floating-point number. - Bits bits_; // The bits that represent the number. - }; - - // Converts an integer from the sign-and-magnitude representation to - // the biased representation. More precisely, let N be 2 to the - // power of (kBitCount - 1), an integer x is represented by the - // unsigned number x + N. - // - // For instance, - // - // -N + 1 (the most negative number representable using - // sign-and-magnitude) is represented by 1; - // 0 is represented by N; and - // N - 1 (the biggest number representable using - // sign-and-magnitude) is represented by 2N - 1. - // - // Read http://en.wikipedia.org/wiki/Signed_number_representations - // for more details on signed number representations. - static Bits SignAndMagnitudeToBiased(const Bits &sam) { - if (kSignBitMask & sam) { - // sam represents a negative number. - return ~sam + 1; - } else { - // sam represents a positive number. - return kSignBitMask | sam; - } - } - - // Given two numbers in the sign-and-magnitude representation, - // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { - const Bits biased1 = SignAndMagnitudeToBiased(sam1); - const Bits biased2 = SignAndMagnitudeToBiased(sam2); - return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); - } - - FloatingPointUnion u_; -}; - -// We cannot use std::numeric_limits::max() as it clashes with the max() -// macro defined by . -template <> -inline float FloatingPoint::Max() { return FLT_MAX; } -template <> -inline double FloatingPoint::Max() { return DBL_MAX; } - -// Typedefs the instances of the FloatingPoint template class that we -// care to use. -typedef FloatingPoint Float; -typedef FloatingPoint Double; - -// In order to catch the mistake of putting tests that use different -// test fixture classes in the same test case, we need to assign -// unique IDs to fixture classes and compare them. The TypeId type is -// used to hold such IDs. The user should treat TypeId as an opaque -// type: the only operation allowed on TypeId values is to compare -// them for equality using the == operator. -typedef const void* TypeId; - -template -class TypeIdHelper { - public: - // dummy_ must not have a const type. Otherwise an overly eager - // compiler (e.g. MSVC 7.1 & 8.0) may try to merge - // TypeIdHelper::dummy_ for different Ts as an "optimization". - static bool dummy_; -}; - -template -bool TypeIdHelper::dummy_ = false; - -// GetTypeId() returns the ID of type T. Different values will be -// returned for different types. Calling the function twice with the -// same type argument is guaranteed to return the same ID. -template -TypeId GetTypeId() { - // The compiler is required to allocate a different - // TypeIdHelper::dummy_ variable for each T used to instantiate - // the template. Therefore, the address of dummy_ is guaranteed to - // be unique. - return &(TypeIdHelper::dummy_); -} - -// Returns the type ID of ::testing::Test. Always call this instead -// of GetTypeId< ::testing::Test>() to get the type ID of -// ::testing::Test, as the latter may give the wrong result due to a -// suspected linker bug when compiling Google Test as a Mac OS X -// framework. -GTEST_API_ TypeId GetTestTypeId(); - -// Defines the abstract factory interface that creates instances -// of a Test object. -class TestFactoryBase { - public: - virtual ~TestFactoryBase() {} - - // Creates a test instance to run. 
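Before the test-factory interface continues below, it is worth noting how the FloatingPoint/AlmostEquals machinery above reaches users: the floating-point assertions compare values by ULP distance rather than with ==. An illustrative test:

#include "gtest/gtest.h"

TEST(FloatCompare, UlpBasedEquality) {
  const float sum = 0.1f + 0.2f;
  // EXPECT_FLOAT_EQ uses AlmostEquals(): the operands must be within
  // kMaxUlps (4) units in the last place of each other.
  EXPECT_FLOAT_EQ(0.3f, sum);
  // EXPECT_NEAR takes an explicit absolute error bound instead.
  EXPECT_NEAR(0.3, 0.1 + 0.2, 1e-9);
}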
The instance is both created and destroyed - // within TestInfoImpl::Run() - virtual Test* CreateTest() = 0; - - protected: - TestFactoryBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); -}; - -// This class provides implementation of TeastFactoryBase interface. -// It is used in TEST and TEST_F macros. -template -class TestFactoryImpl : public TestFactoryBase { - public: - virtual Test* CreateTest() { return new TestClass; } -}; - -#if GTEST_OS_WINDOWS - -// Predicate-formatters for implementing the HRESULT checking macros -// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} -// We pass a long instead of HRESULT to avoid causing an -// include dependency for the HRESULT type. -GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, - long hr); // NOLINT -GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, - long hr); // NOLINT - -#endif // GTEST_OS_WINDOWS - -// Types of SetUpTestCase() and TearDownTestCase() functions. -typedef void (*SetUpTestCaseFunc)(); -typedef void (*TearDownTestCaseFunc)(); - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param text representation of the test's value parameter, -// or NULL if this is not a type-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -GTEST_API_ TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory); - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// State of the definition of a type-parameterized test case. -class GTEST_API_ TypedTestCasePState { - public: - TypedTestCasePState() : registered_(false) {} - - // Adds the given test name to defined_test_names_ and return true - // if the test case hasn't been registered; otherwise aborts the - // program. - bool AddTestName(const char* file, int line, const char* case_name, - const char* test_name) { - if (registered_) { - fprintf(stderr, "%s Test %s must be defined before " - "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", - FormatFileLocation(file, line).c_str(), test_name, case_name); - fflush(stderr); - posix::Abort(); - } - defined_test_names_.insert(test_name); - return true; - } - - // Verifies that registered_tests match the test names in - // defined_test_names_; returns registered_tests if successful, or - // aborts the program otherwise. 
- const char* VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests); - - private: - bool registered_; - ::std::set defined_test_names_; -}; - -// Skips to the first non-space char after the first comma in 'str'; -// returns NULL if no comma is found in 'str'. -inline const char* SkipComma(const char* str) { - const char* comma = strchr(str, ','); - if (comma == NULL) { - return NULL; - } - while (IsSpace(*(++comma))) {} - return comma; -} - -// Returns the prefix of 'str' before the first comma in it; returns -// the entire string if it contains no comma. -inline std::string GetPrefixUntilComma(const char* str) { - const char* comma = strchr(str, ','); - return comma == NULL ? str : std::string(str, comma); -} - -// TypeParameterizedTest::Register() -// registers a list of type-parameterized tests with Google Test. The -// return value is insignificant - we just need to return something -// such that we can call this function in a namespace scope. -// -// Implementation note: The GTEST_TEMPLATE_ macro declares a template -// template parameter. It's defined in gtest-type-util.h. -template -class TypeParameterizedTest { - public: - // 'index' is the index of the test in the type list 'Types' - // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, - // Types). Valid values for 'index' are [0, N - 1] where N is the - // length of Types. - static bool Register(const char* prefix, const char* case_name, - const char* test_names, int index) { - typedef typename Types::Head Type; - typedef Fixture FixtureClass; - typedef typename GTEST_BIND_(TestSel, Type) TestClass; - - // First, registers the first type-parameterized test in the type - // list. - MakeAndRegisterTestInfo( - (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/" - + StreamableToString(index)).c_str(), - GetPrefixUntilComma(test_names).c_str(), - GetTypeName().c_str(), - NULL, // No value parameter. - GetTypeId(), - TestClass::SetUpTestCase, - TestClass::TearDownTestCase, - new TestFactoryImpl); - - // Next, recurses (at compile time) with the tail of the type list. - return TypeParameterizedTest - ::Register(prefix, case_name, test_names, index + 1); - } -}; - -// The base case for the compile time recursion. -template -class TypeParameterizedTest { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/, int /*index*/) { - return true; - } -}; - -// TypeParameterizedTestCase::Register() -// registers *all combinations* of 'Tests' and 'Types' with Google -// Test. The return value is insignificant - we just need to return -// something such that we can call this function in a namespace scope. -template -class TypeParameterizedTestCase { - public: - static bool Register(const char* prefix, const char* case_name, - const char* test_names) { - typedef typename Tests::Head Head; - - // First, register the first test in 'Test' for each type in 'Types'. - TypeParameterizedTest::Register( - prefix, case_name, test_names, 0); - - // Next, recurses (at compile time) with the tail of the test list. - return TypeParameterizedTestCase - ::Register(prefix, case_name, SkipComma(test_names)); - } -}; - -// The base case for the compile time recursion. 
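The Register() recursion above and the base cases below implement what users write with the type-parameterized test macros; an illustrative end-to-end flow (fixture and type list invented):

#include <vector>
#include "gtest/gtest.h"

template <typename T>
class ContainerTest : public ::testing::Test {};

TYPED_TEST_CASE_P(ContainerTest);

TYPED_TEST_P(ContainerTest, DefaultConstructedIsEmpty) {
  TypeParam c;
  EXPECT_EQ(0u, c.size());
}

// Roughly, this is where VerifyRegisteredTestNames() above gets to run.
REGISTER_TYPED_TEST_CASE_P(ContainerTest, DefaultConstructedIsEmpty);

// And this expands into the TypeParameterizedTestCase<...>::Register() call.
typedef ::testing::Types<std::vector<int>, std::vector<double> > ContainerTypes;
INSTANTIATE_TYPED_TEST_CASE_P(My, ContainerTest, ContainerTypes);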
-template -class TypeParameterizedTestCase { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/) { - return true; - } -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( - UnitTest* unit_test, int skip_count); - -// Helpers for suppressing warnings on unreachable code or constant -// condition. - -// Always returns true. -GTEST_API_ bool AlwaysTrue(); - -// Always returns false. -inline bool AlwaysFalse() { return !AlwaysTrue(); } - -// Helper for suppressing false warning from Clang on a const char* -// variable declared in a conditional expression always being NULL in -// the else branch. -struct GTEST_API_ ConstCharPtr { - ConstCharPtr(const char* str) : value(str) {} - operator bool() const { return true; } - const char* value; -}; - -// A simple Linear Congruential Generator for generating random -// numbers with a uniform distribution. Unlike rand() and srand(), it -// doesn't use global state (and therefore can't interfere with user -// code). Unlike rand_r(), it's portable. An LCG isn't very random, -// but it's good enough for our purposes. -class GTEST_API_ Random { - public: - static const UInt32 kMaxRange = 1u << 31; - - explicit Random(UInt32 seed) : state_(seed) {} - - void Reseed(UInt32 seed) { state_ = seed; } - - // Generates a random number from [0, range). Crashes if 'range' is - // 0 or greater than kMaxRange. - UInt32 Generate(UInt32 range); - - private: - UInt32 state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); -}; - -// Defining a variable of type CompileAssertTypesEqual will cause a -// compiler error iff T1 and T2 are different types. -template -struct CompileAssertTypesEqual; - -template -struct CompileAssertTypesEqual { -}; - -// Removes the reference from a type if it is a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::remove_reference, which is not widely available yet. -template -struct RemoveReference { typedef T type; }; // NOLINT -template -struct RemoveReference { typedef T type; }; // NOLINT - -// A handy wrapper around RemoveReference that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_REFERENCE_(T) \ - typename ::testing::internal::RemoveReference::type - -// Removes const from a type if it is a const type, otherwise leaves -// it unchanged. This is the same as tr1::remove_const, which is not -// widely available yet. -template -struct RemoveConst { typedef T type; }; // NOLINT -template -struct RemoveConst { typedef T type; }; // NOLINT - -// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above -// definition to fail to remove the const in 'const int[3]' and 'const -// char[3][4]'. The following specialization works around the bug. 
-template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; - -#if defined(_MSC_VER) && _MSC_VER < 1400 -// This is the only specialization that allows VC++ 7.1 to remove const in -// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC -// and thus needs to be conditionally compiled. -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; -#endif - -// A handy wrapper around RemoveConst that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_CONST_(T) \ - typename ::testing::internal::RemoveConst::type - -// Turns const U&, U&, const U, and U all into U. -#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ - GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) - -// Adds reference to a type if it is not a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::add_reference, which is not widely available yet. -template -struct AddReference { typedef T& type; }; // NOLINT -template -struct AddReference { typedef T& type; }; // NOLINT - -// A handy wrapper around AddReference that works when the argument T -// depends on template parameters. -#define GTEST_ADD_REFERENCE_(T) \ - typename ::testing::internal::AddReference::type - -// Adds a reference to const on top of T as necessary. For example, -// it transforms -// -// char ==> const char& -// const char ==> const char& -// char& ==> const char& -// const char& ==> const char& -// -// The argument T must depend on some template parameters. -#define GTEST_REFERENCE_TO_CONST_(T) \ - GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) - -// ImplicitlyConvertible::value is a compile-time bool -// constant that's true iff type From can be implicitly converted to -// type To. -template -class ImplicitlyConvertible { - private: - // We need the following helper functions only for their types. - // They have no implementations. - - // MakeFrom() is an expression whose type is From. We cannot simply - // use From(), as the type From may not have a public default - // constructor. - static From MakeFrom(); - - // These two functions are overloaded. Given an expression - // Helper(x), the compiler will pick the first version if x can be - // implicitly converted to type To; otherwise it will pick the - // second version. - // - // The first version returns a value of size 1, and the second - // version returns a value of size 2. Therefore, by checking the - // size of Helper(x), which can be done at compile time, we can tell - // which version of Helper() is used, and hence whether x can be - // implicitly converted to type To. - static char Helper(To); - static char (&Helper(...))[2]; // NOLINT - - // We have to put the 'public' section after the 'private' section, - // or MSVC refuses to compile the code. - public: - // MSVC warns about implicitly converting from double to int for - // possible loss of data, so we need to temporarily disable the - // warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4244) // Temporarily disables warning 4244. - - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -# pragma warning(pop) // Restores the warning state. -#elif defined(__BORLANDC__) - // C++Builder cannot use member overload resolution during template - // instantiation. The simplest workaround is to use its C++0x type traits - // functions (C++Builder 2009 and above only). 
- static const bool value = __is_convertible(From, To); -#else - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -#endif // _MSV_VER -}; -template -const bool ImplicitlyConvertible::value; - -// IsAProtocolMessage::value is a compile-time bool constant that's -// true iff T is type ProtocolMessage, proto2::Message, or a subclass -// of those. -template -struct IsAProtocolMessage - : public bool_constant< - ImplicitlyConvertible::value || - ImplicitlyConvertible::value> { -}; - -// When the compiler sees expression IsContainerTest(0), if C is an -// STL-style container class, the first overload of IsContainerTest -// will be viable (since both C::iterator* and C::const_iterator* are -// valid types and NULL can be implicitly converted to them). It will -// be picked over the second overload as 'int' is a perfect match for -// the type of argument 0. If C::iterator or C::const_iterator is not -// a valid type, the first overload is not viable, and the second -// overload will be picked. Therefore, we can determine whether C is -// a container class by checking the type of IsContainerTest(0). -// The value of the expression is insignificant. -// -// Note that we look for both C::iterator and C::const_iterator. The -// reason is that C++ injects the name of a class as a member of the -// class itself (e.g. you can refer to class iterator as either -// 'iterator' or 'iterator::iterator'). If we look for C::iterator -// only, for example, we would mistakenly think that a class named -// iterator is an STL container. -// -// Also note that the simpler approach of overloading -// IsContainerTest(typename C::const_iterator*) and -// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. -typedef int IsContainer; -template -IsContainer IsContainerTest(int /* dummy */, - typename C::iterator* /* it */ = NULL, - typename C::const_iterator* /* const_it */ = NULL) { - return 0; -} - -typedef char IsNotContainer; -template -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } - -// EnableIf::type is void when 'Cond' is true, and -// undefined when 'Cond' is false. To use SFINAE to make a function -// overload only apply when a particular expression is true, add -// "typename EnableIf::type* = 0" as the last parameter. -template struct EnableIf; -template<> struct EnableIf { typedef void type; }; // NOLINT - -// Utilities for native arrays. - -// ArrayEq() compares two k-dimensional native arrays using the -// elements' operator==, where k can be any integer >= 0. When k is -// 0, ArrayEq() degenerates into comparing a single pair of values. - -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs); - -// This generic version is used when k is 0. -template -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } - -// This overload is used when k >= 1. -template -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { - return internal::ArrayEq(lhs, N, rhs); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous ArrayEq() function, arrays with different sizes would -// lead to different copies of the template code. -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs) { - for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; - } - return true; -} - -// Finds the first element in the iterator range [begin, end) that -// equals elem. Element may be a native array type itself. 
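The sizeof-overload idiom that ImplicitlyConvertible (and IsContainerTest) rely on is easier to read in isolation; a standalone sketch with invented names:

#include <iostream>

// Neither Helper() overload is ever called; only the sizes of their return
// types are inspected at compile time.
template <typename From, typename To>
class ConvertibleSketch {
 private:
  static From MakeFrom();         // never defined; used only for its type
  static char Helper(To);         // chosen if From converts to To
  static char (&Helper(...))[2];  // fallback otherwise
 public:
  static const bool value = sizeof(Helper(MakeFrom())) == 1;
};

int main() {
  const bool a = ConvertibleSketch<int, double>::value;   // true
  const bool b = ConvertibleSketch<int*, double>::value;  // false
  std::cout << a << " " << b << "\n";
  return 0;
}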
-template -Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { - for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; - } - return end; -} - -// CopyArray() copies a k-dimensional native array using the elements' -// operator=, where k can be any integer >= 0. When k is 0, -// CopyArray() degenerates into copying a single value. - -template -void CopyArray(const T* from, size_t size, U* to); - -// This generic version is used when k is 0. -template -inline void CopyArray(const T& from, U* to) { *to = from; } - -// This overload is used when k >= 1. -template -inline void CopyArray(const T(&from)[N], U(*to)[N]) { - internal::CopyArray(from, N, *to); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous CopyArray() function, arrays with different sizes -// would lead to different copies of the template code. -template -void CopyArray(const T* from, size_t size, U* to) { - for (size_t i = 0; i != size; i++) { - internal::CopyArray(from[i], to + i); - } -} - -// The relation between an NativeArray object (see below) and the -// native array it represents. -enum RelationToSource { - kReference, // The NativeArray references the native array. - kCopy // The NativeArray makes a copy of the native array and - // owns the copy. -}; - -// Adapts a native array to a read-only STL-style container. Instead -// of the complete STL container concept, this adaptor only implements -// members useful for Google Mock's container matchers. New members -// should be added as needed. To simplify the implementation, we only -// support Element being a raw type (i.e. having no top-level const or -// reference modifier). It's the client's responsibility to satisfy -// this requirement. Element can be an array type itself (hence -// multi-dimensional arrays are supported). -template -class NativeArray { - public: - // STL-style container typedefs. - typedef Element value_type; - typedef Element* iterator; - typedef const Element* const_iterator; - - // Constructs from a native array. - NativeArray(const Element* array, size_t count, RelationToSource relation) { - Init(array, count, relation); - } - - // Copy constructor. - NativeArray(const NativeArray& rhs) { - Init(rhs.array_, rhs.size_, rhs.relation_to_source_); - } - - ~NativeArray() { - // Ensures that the user doesn't instantiate NativeArray with a - // const or reference type. - static_cast(StaticAssertTypeEqHelper()); - if (relation_to_source_ == kCopy) - delete[] array_; - } - - // STL-style container methods. - size_t size() const { return size_; } - const_iterator begin() const { return array_; } - const_iterator end() const { return array_ + size_; } - bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); - } - - private: - // Initializes this object; makes a copy of the input array if - // 'relation' is kCopy. 
- void Init(const Element* array, size_t a_size, RelationToSource relation) { - if (relation == kReference) { - array_ = array; - } else { - Element* const copy = new Element[a_size]; - CopyArray(array, a_size, copy); - array_ = copy; - } - size_ = a_size; - relation_to_source_ = relation; - } - - const Element* array_; - size_t size_; - RelationToSource relation_to_source_; - - GTEST_DISALLOW_ASSIGN_(NativeArray); -}; - -} // namespace internal -} // namespace testing - -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() - -#define GTEST_MESSAGE_(message, result_type) \ - GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) - -#define GTEST_FATAL_FAILURE_(message) \ - return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) - -#define GTEST_NONFATAL_FAILURE_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) - -#define GTEST_SUCCESS_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) - -// Suppresses MSVC warnings 4072 (unreachable code) for the code following -// statement if it returns or throws (or doesn't return or throw in some -// situations). -#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ - if (::testing::internal::AlwaysTrue()) { statement; } - -#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::ConstCharPtr gtest_msg = "") { \ - bool gtest_caught_expected = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (expected_exception const&) { \ - gtest_caught_expected = true; \ - } \ - catch (...) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws a different type."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - if (!gtest_caught_expected) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws nothing."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ - fail(gtest_msg.value) - -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: it throws.") - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - - -// Implements Boolean test assertions such as EXPECT_TRUE. expression can be -// either a boolean expression or an AssertionResult. 
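The exception-related macros above back the user-facing EXPECT_THROW family; an illustrative test:

#include <stdexcept>
#include <vector>
#include "gtest/gtest.h"

TEST(VectorAt, OutOfRangeChecks) {
  std::vector<int> v(3);
  EXPECT_THROW(v.at(10), std::out_of_range);  // must throw exactly this type
  EXPECT_NO_THROW(v.at(1));                   // must not throw at all
  EXPECT_ANY_THROW(v.at(99));                 // any exception type will do
}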
text is a textual -// represenation of expression as it was passed into the EXPECT_TRUE. -#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") - -// Expands to the name of the class that implements the given test. -#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - test_case_name##_##test_name##_Test - -// Helper macro for defining tests. -#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ -class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ - public:\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ - private:\ - virtual void TestBody();\ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ -};\ -\ -::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ - ::test_info_ =\ - ::testing::internal::MakeAndRegisterTestInfo(\ - #test_case_name, #test_name, NULL, NULL, \ - (parent_id), \ - parent_class::SetUpTestCase, \ - parent_class::TearDownTestCase, \ - new ::testing::internal::TestFactoryImpl<\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ -void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the public API for death tests. It is -// #included by gtest.h so a user doesn't need to include this -// directly. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines internal utilities needed for implementing -// death tests. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - - -#include - -namespace testing { -namespace internal { - -GTEST_DECLARE_string_(internal_run_death_test); - -// Names of the flags (needed for parsing Google Test flags). -const char kDeathTestStyleFlag[] = "death_test_style"; -const char kDeathTestUseFork[] = "death_test_use_fork"; -const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; - -#if GTEST_HAS_DEATH_TEST - -// DeathTest is a class that hides much of the complexity of the -// GTEST_DEATH_TEST_ macro. 
It is abstract; its static Create method -// returns a concrete class that depends on the prevailing death test -// style, as defined by the --gtest_death_test_style and/or -// --gtest_internal_run_death_test flags. - -// In describing the results of death tests, these terms are used with -// the corresponding definitions: -// -// exit status: The integer exit information in the format specified -// by wait(2) -// exit code: The integer code passed to exit(3), _exit(2), or -// returned from main() -class GTEST_API_ DeathTest { - public: - // Create returns false if there was an error determining the - // appropriate action to take for the current death test; for example, - // if the gtest_death_test_style flag is set to an invalid value. - // The LastMessage method will return a more detailed message in that - // case. Otherwise, the DeathTest pointer pointed to by the "test" - // argument is set. If the death test should be skipped, the pointer - // is set to NULL; otherwise, it is set to the address of a new concrete - // DeathTest object that controls the execution of the current test. - static bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); - DeathTest(); - virtual ~DeathTest() { } - - // A helper class that aborts a death test when it's deleted. - class ReturnSentinel { - public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } - ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } - private: - DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); - } GTEST_ATTRIBUTE_UNUSED_; - - // An enumeration of possible roles that may be taken when a death - // test is encountered. EXECUTE means that the death test logic should - // be executed immediately. OVERSEE means that the program should prepare - // the appropriate environment for a child process to execute the death - // test, then wait for it to complete. - enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; - - // An enumeration of the three reasons that a test might be aborted. - enum AbortReason { - TEST_ENCOUNTERED_RETURN_STATEMENT, - TEST_THREW_EXCEPTION, - TEST_DID_NOT_DIE - }; - - // Assumes one of the above roles. - virtual TestRole AssumeRole() = 0; - - // Waits for the death test to finish and returns its status. - virtual int Wait() = 0; - - // Returns true if the death test passed; that is, the test process - // exited during the test, its exit status matches a user-supplied - // predicate, and its stderr output matches a user-supplied regular - // expression. - // The user-supplied predicate may be a macro expression rather - // than a function pointer or functor, or else Wait and Passed could - // be combined. - virtual bool Passed(bool exit_status_ok) = 0; - - // Signals that the death test did not die as expected. - virtual void Abort(AbortReason reason) = 0; - - // Returns a human-readable outcome message regarding the outcome of - // the last death test. - static const char* LastMessage(); - - static void set_last_death_test_message(const std::string& message); - - private: - // A string containing a description of the outcome of the last death test. - static std::string last_death_test_message_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); -}; - -// Factory interface for death tests. May be mocked out for testing. 
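As an aside, the wait(2)-style exit status described above is exactly what a user-supplied predicate for ASSERT_EXIT/EXPECT_EXIT receives. A minimal sketch, assuming a POSIX target and the usual "gtest/gtest.h" include; the predicate and test names are invented for illustration:

#include <sys/wait.h>
#include <unistd.h>
#include "gtest/gtest.h"

// Hypothetical predicate: takes the raw wait(2) exit status and accepts a
// normal exit with code 2, much like the built-in ::testing::ExitedWithCode(2).
bool ExitedWithCodeTwo(int exit_status) {
  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == 2;
}

TEST(IllustrativeDeathTest, ExitStatusPredicate) {
  // The empty regex places no constraint on the child's stderr output.
  ASSERT_EXIT(_exit(2), ExitedWithCodeTwo, "");
}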
-class DeathTestFactory { - public: - virtual ~DeathTestFactory() { } - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) = 0; -}; - -// A concrete DeathTestFactory implementation for normal use. -class DefaultDeathTestFactory : public DeathTestFactory { - public: - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); -}; - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -GTEST_API_ bool ExitedUnsuccessfully(int exit_status); - -// Traps C++ exceptions escaping statement and reports them as test -// failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ - ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } - -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) - -# endif - -// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, -// ASSERT_EXIT*, and EXPECT_EXIT*. -# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - const ::testing::internal::RE& gtest_regex = (regex); \ - ::testing::internal::DeathTest* gtest_dt; \ - if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ - __FILE__, __LINE__, >est_dt)) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - if (gtest_dt != NULL) { \ - ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ - gtest_dt_ptr(gtest_dt); \ - switch (gtest_dt->AssumeRole()) { \ - case ::testing::internal::DeathTest::OVERSEE_TEST: \ - if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - break; \ - case ::testing::internal::DeathTest::EXECUTE_TEST: { \ - ::testing::internal::DeathTest::ReturnSentinel \ - gtest_sentinel(gtest_dt); \ - GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ - gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ - break; \ - } \ - default: \ - break; \ - } \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ - fail(::testing::internal::DeathTest::LastMessage()) -// The symbol "fail" here expands to something into which a message -// can be streamed. - -// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in -// NDEBUG mode. In this case we need the statements to be executed, the regex is -// ignored, and the macro must accept a streamed message even though the message -// is never printed. 
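To make that last requirement concrete, here is a hedged sketch of user code that has to keep compiling under both build modes (the helper and test names are invented; the regex assumes a glibc-style assert() message):

#include <cassert>
#include "gtest/gtest.h"

int g_counter = 0;

// Illustrative helper: performs a side effect, then aborts only in debug builds.
void BumpThenMaybeDie() {
  ++g_counter;
  assert(false);
}

TEST(IllustrativeDebugDeathTest, AcceptsStreamedMessage) {
  // In an NDEBUG build the call simply runs (g_counter becomes 1) and the
  // regex is ignored; in a debug build it runs in a child process and must
  // die with stderr matching the regex. The streamed text must compile in
  // both modes even though it may never be printed.
  EXPECT_DEBUG_DEATH(BumpThenMaybeDie(), "Assertion")
      << "counter was " << g_counter;
}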
-# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } else \ - ::testing::Message() - -// A class representing the parsed contents of the -// --gtest_internal_run_death_test flag, as it existed when -// RUN_ALL_TESTS was called. -class InternalRunDeathTestFlag { - public: - InternalRunDeathTestFlag(const std::string& a_file, - int a_line, - int an_index, - int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} - - ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); - } - - const std::string& file() const { return file_; } - int line() const { return line_; } - int index() const { return index_; } - int write_fd() const { return write_fd_; } - - private: - std::string file_; - int line_; - int index_; - int write_fd_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); -}; - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); - -#else // GTEST_HAS_DEATH_TEST - -// This macro is used for implementing macros such as -// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where -// death tests are not supported. Those macros must compile on such systems -// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on -// systems that support death tests. This allows one to write such a macro -// on a system that does not support death tests and be sure that it will -// compile on a death-test supporting system. -// -// Parameters: -// statement - A statement that a macro such as EXPECT_DEATH would test -// for program termination. This macro has to make sure this -// statement is compiled but not executed, to ensure that -// EXPECT_DEATH_IF_SUPPORTED compiles with a certain -// parameter iff EXPECT_DEATH compiles with it. -// regex - A regex that a macro such as EXPECT_DEATH would use to test -// the output of statement. This parameter has to be -// compiled but not evaluated by this macro, to ensure that -// this macro only accepts expressions that a macro such as -// EXPECT_DEATH would accept. -// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED -// and a return statement for ASSERT_DEATH_IF_SUPPORTED. -// This ensures that ASSERT_DEATH_IF_SUPPORTED will not -// compile inside functions where ASSERT_DEATH doesn't -// compile. -// -// The branch that has an always false condition is used to ensure that -// statement and regex are compiled (and thus syntactically correct) but -// never executed. The unreachable code macro protects the terminator -// statement from generating an 'unreachable code' warning in case -// statement unconditionally returns or throws. The Message constructor at -// the end allows the syntax of streaming additional messages into the -// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. 
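Downstream of this machinery, EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED are the portable entry points. A small usage sketch, with an invented function under test:

#include <cstdlib>
#include "gtest/gtest.h"

// Invented function under test.
void CrashOnNullConfig(const char* config) {
  if (config == NULL) abort();
}

TEST(IllustrativePortableDeathTest, RejectsNullConfig) {
  // On platforms with death test support this forks and checks the child;
  // elsewhere it expands to the warning-only branch described above, but the
  // statement and the regex still have to compile.
  EXPECT_DEATH_IF_SUPPORTED(CrashOnNullConfig(NULL), "");
}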
-# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - -namespace testing { - -// This flag controls the style of death tests. Valid values are "threadsafe", -// meaning that the death test child process will re-execute the test binary -// from the start, running only a single death test, or "fast", -// meaning that the child process will execute the test logic immediately -// after forking. -GTEST_DECLARE_string_(death_test_style); - -#if GTEST_HAS_DEATH_TEST - -namespace internal { - -// Returns a Boolean value indicating whether the caller is currently -// executing in the context of the death test child process. Tools such as -// Valgrind heap checkers may need this to modify their behavior in death -// tests. IMPORTANT: This is an internal utility. Using it may break the -// implementation of death tests. User code MUST NOT use it. -GTEST_API_ bool InDeathTestChild(); - -} // namespace internal - -// The following macros are useful for writing death tests. - -// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is -// executed: -// -// 1. It generates a warning if there is more than one active -// thread. This is because it's safe to fork() or clone() only -// when there is a single thread. -// -// 2. The parent process clone()s a sub-process and runs the death -// test in it; the sub-process exits with code 0 at the end of the -// death test, if it hasn't exited already. -// -// 3. The parent process waits for the sub-process to terminate. -// -// 4. The parent process checks the exit code and error message of -// the sub-process. -// -// Examples: -// -// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); -// for (int i = 0; i < 5; i++) { -// EXPECT_DEATH(server.ProcessRequest(i), -// "Invalid request .* in ProcessRequest()") -// << "Failed to die on request " << i; -// } -// -// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); -// -// bool KilledBySIGHUP(int exit_code) { -// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; -// } -// -// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); -// -// On the regular expressions used in death tests: -// -// On POSIX-compliant systems (*nix), we use the library, -// which uses the POSIX extended regex syntax. -// -// On other platforms (e.g. Windows), we only support a simple regex -// syntax implemented as part of Google Test. This limited -// implementation should be enough most of the time when writing -// death tests; though it lacks many features you can find in PCRE -// or POSIX extended regex syntax. For example, we don't support -// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and -// repetition count ("x{5,7}"), among others. -// -// Below is the syntax that we do support. We chose it to be a -// subset of both PCRE and POSIX extended regex, so it's easy to -// learn wherever you come from. 
In the following: 'A' denotes a -// literal character, period (.), or a single \\ escape sequence; -// 'x' and 'y' denote regular expressions; 'm' and 'n' are for -// natural numbers. -// -// c matches any literal character c -// \\d matches any decimal digit -// \\D matches any character that's not a decimal digit -// \\f matches \f -// \\n matches \n -// \\r matches \r -// \\s matches any ASCII whitespace, including \n -// \\S matches any character that's not a whitespace -// \\t matches \t -// \\v matches \v -// \\w matches any letter, _, or decimal digit -// \\W matches any character that \\w doesn't match -// \\c matches any literal character c, which must be a punctuation -// . matches any single character except \n -// A? matches 0 or 1 occurrences of A -// A* matches 0 or many occurrences of A -// A+ matches 1 or many occurrences of A -// ^ matches the beginning of a string (not that of each line) -// $ matches the end of a string (not that of each line) -// xy matches x followed by y -// -// If you accidentally use PCRE or POSIX extended regex features -// not implemented by us, you will get a run-time failure. In that -// case, please try to rewrite your regular expression within the -// above syntax. -// -// This implementation is *not* meant to be as highly tuned or robust -// as a compiled regex library, but should perform well enough for a -// death test, which already incurs significant overhead by launching -// a child process. -// -// Known caveats: -// -// A "threadsafe" style death test obtains the path to the test -// program from argv[0] and re-executes it in the sub-process. For -// simplicity, the current implementation doesn't search the PATH -// when launching the sub-process. This means that the user must -// invoke the test program via a path that contains at least one -// path separator (e.g. path/to/foo_test and -// /absolute/path/to/bar_test are fine, but foo_test is not). This -// is rarely a problem as people usually don't put the test binary -// directory in PATH. -// -// TODO(wan@google.com): make thread-safe death tests search the PATH. - -// Asserts that a given statement causes the program to exit, with an -// integer exit status that satisfies predicate, and emitting error output -// that matches regex. -# define ASSERT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) - -// Like ASSERT_EXIT, but continues on to successive tests in the -// test case, if any: -# define EXPECT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) - -// Asserts that a given statement causes the program to exit, either by -// explicitly exiting with a nonzero exit code or being killed by a -// signal, and emitting error output that matches regex. -# define ASSERT_DEATH(statement, regex) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Like ASSERT_DEATH, but continues on to successive tests in the -// test case, if any: -# define EXPECT_DEATH(statement, regex) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: - -// Tests that an exit code describes a normal exit with a given exit code. -class GTEST_API_ ExitedWithCode { - public: - explicit ExitedWithCode(int exit_code); - bool operator()(int exit_status) const; - private: - // No implementation - assignment is unsupported. 
- void operator=(const ExitedWithCode& other); - - const int exit_code_; -}; - -# if !GTEST_OS_WINDOWS -// Tests that an exit code describes an exit due to termination by a -// given signal. -class GTEST_API_ KilledBySignal { - public: - explicit KilledBySignal(int signum); - bool operator()(int exit_status) const; - private: - const int signum_; -}; -# endif // !GTEST_OS_WINDOWS - -// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. -// The death testing framework causes this to have interesting semantics, -// since the sideeffects of the call are only visible in opt mode, and not -// in debug mode. -// -// In practice, this can be used to test functions that utilize the -// LOG(DFATAL) macro using the following style: -// -// int DieInDebugOr12(int* sideeffect) { -// if (sideeffect) { -// *sideeffect = 12; -// } -// LOG(DFATAL) << "death"; -// return 12; -// } -// -// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { -// int sideeffect = 0; -// // Only asserts in dbg. -// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); -// -// #ifdef NDEBUG -// // opt-mode has sideeffect visible. -// EXPECT_EQ(12, sideeffect); -// #else -// // dbg-mode no visible sideeffect. -// EXPECT_EQ(0, sideeffect); -// #endif -// } -// -// This will assert that DieInDebugReturn12InOpt() crashes in debug -// mode, usually due to a DCHECK or LOG(DFATAL), but returns the -// appropriate fallback value (12 in this case) in opt mode. If you -// need to test that a function has appropriate side-effects in opt -// mode, include assertions against the side-effects. A general -// pattern for this is: -// -// EXPECT_DEBUG_DEATH({ -// // Side-effects here will have an effect after this statement in -// // opt mode, but none in debug mode. -// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); -// }, "death"); -// -# ifdef NDEBUG - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - GTEST_EXECUTE_STATEMENT_(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - GTEST_EXECUTE_STATEMENT_(statement, regex) - -# else - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) - -# endif // NDEBUG for EXPECT_DEBUG_DEATH -#endif // GTEST_HAS_DEATH_TEST - -// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and -// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if -// death tests are supported; otherwise they just issue a warning. This is -// useful when you are combining death test assertions with normal test -// assertions in one test. -#if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) -#else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) -#endif - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -// This file was GENERATED by command: -// pump.py gtest-param-test.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: vladl@google.com (Vlad Losev) -// -// Macros and functions for implementing parameterized tests -// in Google C++ Testing Framework (Google Test) -// -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - - -// Value-parameterized tests allow you to test your code with different -// parameters without writing multiple copies of the same test. -// -// Here is how you use value-parameterized tests: - -#if 0 - -// To write value-parameterized tests, first you should define a fixture -// class. It is usually derived from testing::TestWithParam (see below for -// another inheritance scheme that's sometimes useful in more complicated -// class hierarchies), where the type of your parameter values. -// TestWithParam is itself derived from testing::Test. T can be any -// copyable type. If it's a raw pointer, you are responsible for managing the -// lifespan of the pointed values. - -class FooTest : public ::testing::TestWithParam { - // You can implement all the usual class fixture members here. -}; - -// Then, use the TEST_P macro to define as many parameterized tests -// for this fixture as you want. The _P suffix is for "parameterized" -// or "pattern", whichever you prefer to think. - -TEST_P(FooTest, DoesBlah) { - // Inside a test, access the test parameter with the GetParam() method - // of the TestWithParam class: - EXPECT_TRUE(foo.Blah(GetParam())); - ... -} - -TEST_P(FooTest, HasBlahBlah) { - ... -} - -// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test -// case with any set of parameters you want. Google Test defines a number -// of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. Here is a summary of them, which -// are all in the testing namespace: -// -// -// Range(begin, end [, step]) - Yields values {begin, begin+step, -// begin+step+step, ...}. The values do not -// include end. 
step defaults to 1. -// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. -// ValuesIn(container) - Yields values from a C-style array, an STL -// ValuesIn(begin,end) container, or an iterator range [begin, end). -// Bool() - Yields sequence {false, true}. -// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product -// for the math savvy) of the values generated -// by the N generators. -// -// For more details, see comments at the definitions of these functions below -// in this file. -// -// The following statement will instantiate tests from the FooTest test case -// each with parameter values "meeny", "miny", and "moe". - -INSTANTIATE_TEST_CASE_P(InstantiationName, - FooTest, - Values("meeny", "miny", "moe")); - -// To distinguish different instances of the pattern, (yes, you -// can instantiate it more then once) the first argument to the -// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the -// actual test case name. Remember to pick unique prefixes for different -// instantiations. The tests from the instantiation above will have -// these names: -// -// * InstantiationName/FooTest.DoesBlah/0 for "meeny" -// * InstantiationName/FooTest.DoesBlah/1 for "miny" -// * InstantiationName/FooTest.DoesBlah/2 for "moe" -// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" -// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" -// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" -// -// You can use these names in --gtest_filter. -// -// This statement will instantiate all tests from FooTest again, each -// with parameter values "cat" and "dog": - -const char* pets[] = {"cat", "dog"}; -INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); - -// The tests from the instantiation above will have these names: -// -// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" -// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" -// -// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests -// in the given test case, whether their definitions come before or -// AFTER the INSTANTIATE_TEST_CASE_P statement. -// -// Please also note that generator expressions (including parameters to the -// generators) are evaluated in InitGoogleTest(), after main() has started. -// This allows the user on one hand, to adjust generator parameters in order -// to dynamically determine a set of tests to run and on the other hand, -// give the user a chance to inspect the generated tests with Google Test -// reflection API before RUN_ALL_TESTS() is executed. -// -// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc -// for more examples. -// -// In the future, we plan to publish the API for defining new parameter -// generators. But for now this interface remains part of the internal -// implementation and is subject to change. -// -// -// A parameterized test fixture must be derived from testing::Test and from -// testing::WithParamInterface, where T is the type of the parameter -// values. Inheriting from TestWithParam satisfies that requirement because -// TestWithParam inherits from both Test and WithParamInterface. In more -// complicated hierarchies, however, it is occasionally useful to inherit -// separately from Test and WithParamInterface. 
For example: - -class BaseTest : public ::testing::Test { - // You can inherit all the usual members for a non-parameterized test - // fixture here. -}; - -class DerivedTest : public BaseTest, public ::testing::WithParamInterface { - // The usual test fixture members go here too. -}; - -TEST_F(BaseTest, HasFoo) { - // This is an ordinary non-parameterized test. -} - -TEST_P(DerivedTest, DoesBlah) { - // GetParam works just the same here as if you inherit from TestWithParam. - EXPECT_TRUE(foo.Blah(GetParam())); -} - -#endif // 0 - - -#if !GTEST_OS_SYMBIAN -# include -#endif - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ - -#include -#include -#include - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2003 Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: Dan Egnor (egnor@google.com) -// -// A "smart" pointer type with reference tracking. Every pointer to a -// particular object is kept on a circular linked list. When the last pointer -// to an object is destroyed or reassigned, the object is deleted. -// -// Used properly, this deletes the object when the last reference goes away. -// There are several caveats: -// - Like all reference counting schemes, cycles lead to leaks. -// - Each smart pointer is actually two pointers (8 bytes instead of 4). -// - Every time a pointer is assigned, the entire list of pointers to that -// object is traversed. This class is therefore NOT SUITABLE when there -// will often be more than two or three pointers to a particular object. -// - References are only tracked as long as linked_ptr<> objects are copied. -// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS -// will happen (double deletion). -// -// A good use of this class is storing object references in STL containers. -// You can safely put linked_ptr<> in a vector<>. -// Other uses may not be as good. -// -// Note: If you use an incomplete type with linked_ptr<>, the class -// *containing* linked_ptr<> must have a constructor and destructor (even -// if they do nothing!). -// -// Bill Gibbons suggested we use something like this. -// -// Thread Safety: -// Unlike other linked_ptr implementations, in this implementation -// a linked_ptr object is thread-safe in the sense that: -// - it's safe to copy linked_ptr objects concurrently, -// - it's safe to copy *from* a linked_ptr and read its underlying -// raw pointer (e.g. via get()) concurrently, and -// - it's safe to write to two linked_ptrs that point to the same -// shared object concurrently. -// TODO(wan@google.com): rename this to safe_linked_ptr to avoid -// confusion with normal linked_ptr. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ - -#include -#include - - -namespace testing { -namespace internal { - -// Protects copying of all linked_ptr objects. -GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// This is used internally by all instances of linked_ptr<>. It needs to be -// a non-template class because different types of linked_ptr<> can refer to -// the same object (linked_ptr(obj) vs linked_ptr(obj)). -// So, it needs to be possible for different types of linked_ptr to participate -// in the same circular linked list, so we need a single class type here. -// -// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. -class linked_ptr_internal { - public: - // Create a new circle that includes only this instance. 
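A hedged sketch of the container use case suggested above (Widget is invented, the include path assumes a non-fused gtest checkout, and linked_ptr is gtest-internal API rather than something user code should normally reach for):

#include <vector>
#include "gtest/internal/gtest-linked_ptr.h"

struct Widget {
  explicit Widget(int id) : id(id) {}
  int id;
};

void Demo() {
  // Copying grows the circular reference list; the Widget is deleted when the
  // last linked_ptr that refers to it departs the circle.
  std::vector<testing::internal::linked_ptr<Widget> > widgets;
  widgets.push_back(testing::internal::make_linked_ptr(new Widget(1)));
  std::vector<testing::internal::linked_ptr<Widget> > copies = widgets;
  (void)copies;
}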
- void join_new() { - next_ = this; - } - - // Many linked_ptr operations may change p.link_ for some linked_ptr - // variable p in the same circle as this object. Therefore we need - // to prevent two such operations from occurring concurrently. - // - // Note that different types of linked_ptr objects can coexist in a - // circle (e.g. linked_ptr, linked_ptr, and - // linked_ptr). Therefore we must use a single mutex to - // protect all linked_ptr objects. This can create serious - // contention in production code, but is acceptable in a testing - // framework. - - // Join an existing circle. - void join(linked_ptr_internal const* ptr) - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - linked_ptr_internal const* p = ptr; - while (p->next_ != ptr) p = p->next_; - p->next_ = this; - next_ = ptr; - } - - // Leave whatever circle we're part of. Returns true if we were the - // last member of the circle. Once this is done, you can join() another. - bool depart() - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - if (next_ == this) return true; - linked_ptr_internal const* p = next_; - while (p->next_ != this) p = p->next_; - p->next_ = next_; - return false; - } - - private: - mutable linked_ptr_internal const* next_; -}; - -template -class linked_ptr { - public: - typedef T element_type; - - // Take over ownership of a raw pointer. This should happen as soon as - // possible after the object is created. - explicit linked_ptr(T* ptr = NULL) { capture(ptr); } - ~linked_ptr() { depart(); } - - // Copy an existing linked_ptr<>, adding ourselves to the list of references. - template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } - linked_ptr(linked_ptr const& ptr) { // NOLINT - assert(&ptr != this); - copy(&ptr); - } - - // Assignment releases the old value and acquires the new. - template linked_ptr& operator=(linked_ptr const& ptr) { - depart(); - copy(&ptr); - return *this; - } - - linked_ptr& operator=(linked_ptr const& ptr) { - if (&ptr != this) { - depart(); - copy(&ptr); - } - return *this; - } - - // Smart pointer members. - void reset(T* ptr = NULL) { - depart(); - capture(ptr); - } - T* get() const { return value_; } - T* operator->() const { return value_; } - T& operator*() const { return *value_; } - - bool operator==(T* p) const { return value_ == p; } - bool operator!=(T* p) const { return value_ != p; } - template - bool operator==(linked_ptr const& ptr) const { - return value_ == ptr.get(); - } - template - bool operator!=(linked_ptr const& ptr) const { - return value_ != ptr.get(); - } - - private: - template - friend class linked_ptr; - - T* value_; - linked_ptr_internal link_; - - void depart() { - if (link_.depart()) delete value_; - } - - void capture(T* ptr) { - value_ = ptr; - link_.join_new(); - } - - template void copy(linked_ptr const* ptr) { - value_ = ptr->get(); - if (value_) - link_.join(&ptr->link_); - else - link_.join_new(); - } -}; - -template inline -bool operator==(T* ptr, const linked_ptr& x) { - return ptr == x.get(); -} - -template inline -bool operator!=(T* ptr, const linked_ptr& x) { - return ptr != x.get(); -} - -// A function to convert T* into linked_ptr -// Doing e.g. 
make_linked_ptr(new FooBarBaz(arg)) is a shorter notation -// for linked_ptr >(new FooBarBaz(arg)) -template -linked_ptr make_linked_ptr(T* ptr) { - return linked_ptr(ptr); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// A user can teach this function how to print a class type T by -// defining either operator<<() or PrintTo() in the namespace that -// defines T. More specifically, the FIRST defined function in the -// following list will be used (assuming T is defined in namespace -// foo): -// -// 1. foo::PrintTo(const T&, ostream*) -// 2. operator<<(ostream&, const T&) defined in either foo or the -// global namespace. -// -// If none of the above is defined, it will print the debug string of -// the value if it is a protocol buffer, or print the raw bytes in the -// value otherwise. -// -// To aid debugging: when T is a reference type, the address of the -// value is also printed; when T is a (const) char pointer, both the -// pointer value and the NUL-terminated string it points to are -// printed. -// -// We also provide some convenient wrappers: -// -// // Prints a value to a string. For a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. -// std::string ::testing::PrintToString(const T& value); -// -// // Prints a value tersely: for a reference type, the referenced -// // value (but not the address) is printed; for a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. 
-// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); -// -// // Prints value using the type inferred by the compiler. The difference -// // from UniversalTersePrint() is that this function prints both the -// // pointer and the NUL-terminated string for a (const or not) char pointer. -// void ::testing::internal::UniversalPrint(const T& value, ostream*); -// -// // Prints the fields of a tuple tersely to a string vector, one -// // element for each field. Tuple support must be enabled in -// // gtest-port.h. -// std::vector UniversalTersePrintTupleFieldsToStrings( -// const Tuple& value); -// -// Known limitation: -// -// The print primitives print the elements of an STL-style container -// using the compiler-inferred type of *iter where iter is a -// const_iterator of the container. When const_iterator is an input -// iterator but not a forward iterator, this inferred type may not -// match value_type, and the print output may be incorrect. In -// practice, this is rarely a problem as for most containers -// const_iterator is a forward iterator. We'll fix this if there's an -// actual need for it. Note that this fix cannot rely on value_type -// being defined as many user-defined container types don't have -// value_type. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#include // NOLINT -#include -#include -#include -#include - -namespace testing { - -// Definitions in the 'internal' and 'internal2' name spaces are -// subject to change without notice. DO NOT USE THEM IN USER CODE! -namespace internal2 { - -// Prints the given number of bytes in the given object to the given -// ostream. -GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); - -// For selecting which printer to use when a given type has neither << -// nor PrintTo(). -enum TypeKind { - kProtobuf, // a protobuf type - kConvertibleToInteger, // a type implicitly convertible to BiggestInt - // (e.g. a named or unnamed enum type) - kOtherType // anything else -}; - -// TypeWithoutFormatter::PrintValue(value, os) is called -// by the universal printer to print a value of type T when neither -// operator<< nor PrintTo() is defined for T, where kTypeKind is the -// "kind" of T as defined by enum TypeKind. -template -class TypeWithoutFormatter { - public: - // This default version is called when kTypeKind is kOtherType. - static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(reinterpret_cast(&value), - sizeof(value), os); - } -}; - -// We print a protobuf using its ShortDebugString() when the string -// doesn't exceed this many characters; otherwise we print it using -// DebugString() for better readability. -const size_t kProtobufOneLinerMaxLength = 50; - -template -class TypeWithoutFormatter { - public: - static void PrintValue(const T& value, ::std::ostream* os) { - const ::testing::internal::string short_str = value.ShortDebugString(); - const ::testing::internal::string pretty_str = - short_str.length() <= kProtobufOneLinerMaxLength ? - short_str : ("\n" + value.DebugString()); - *os << ("<" + pretty_str + ">"); - } -}; - -template -class TypeWithoutFormatter { - public: - // Since T has no << operator or PrintTo() but can be implicitly - // converted to BiggestInt, we print it as a BiggestInt. - // - // Most likely T is an enum type (either named or unnamed), in which - // case printing it as an integer is the desired behavior. 
In case - // T is not an enum, printing it as an integer is the best we can do - // given that it has no user-defined printer. - static void PrintValue(const T& value, ::std::ostream* os) { - const internal::BiggestInt kBigInt = value; - *os << kBigInt; - } -}; - -// Prints the given value to the given ostream. If the value is a -// protocol message, its debug string is printed; if it's an enum or -// of a type implicitly convertible to BiggestInt, it's printed as an -// integer; otherwise the bytes in the value are printed. This is -// what UniversalPrinter::Print() does when it knows nothing about -// type T and T has neither << operator nor PrintTo(). -// -// A user can override this behavior for a class type Foo by defining -// a << operator in the namespace where Foo is defined. -// -// We put this operator in namespace 'internal2' instead of 'internal' -// to simplify the implementation, as much code in 'internal' needs to -// use << in STL, which would conflict with our own << were it defined -// in 'internal'. -// -// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If -// we define it to take an std::ostream instead, we'll get an -// "ambiguous overloads" compiler error when trying to print a type -// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether -// operator<<(std::ostream&, const T&) or -// operator<<(std::basic_stream, const Foo&) is more -// specific. -template -::std::basic_ostream& operator<<( - ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value ? kProtobuf : - internal::ImplicitlyConvertible::value ? - kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); - return os; -} - -} // namespace internal2 -} // namespace testing - -// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up -// magic needed for implementing UniversalPrinter won't work. -namespace testing_internal { - -// Used to print a value that is not an STL-style container when the -// user doesn't define PrintTo() for it. -template -void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { - // With the following statement, during unqualified name lookup, - // testing::internal2::operator<< appears as if it was declared in - // the nearest enclosing namespace that contains both - // ::testing_internal and ::testing::internal2, i.e. the global - // namespace. For more details, refer to the C++ Standard section - // 7.3.4-1 [namespace.udir]. This allows us to fall back onto - // testing::internal2::operator<< in case T doesn't come with a << - // operator. - // - // We cannot write 'using ::testing::internal2::operator<<;', which - // gcc 3.3 fails to compile due to a compiler bug. - using namespace ::testing::internal2; // NOLINT - - // Assuming T is defined in namespace foo, in the next statement, - // the compiler will consider all of: - // - // 1. foo::operator<< (thanks to Koenig look-up), - // 2. ::operator<< (as the current namespace is enclosed in ::), - // 3. testing::internal2::operator<< (thanks to the using statement above). - // - // The operator<< whose type matches T best will be picked. - // - // We deliberately allow #2 to be a candidate, as sometimes it's - // impossible to define #1 (e.g. when foo is ::std, defining - // anything in it is undefined behavior unless you are a compiler - // vendor.). 
- *os << value; -} - -} // namespace testing_internal - -namespace testing { -namespace internal { - -// UniversalPrinter::Print(value, ostream_ptr) prints the given -// value to the given ostream. The caller must ensure that -// 'ostream_ptr' is not NULL, or the behavior is undefined. -// -// We define UniversalPrinter as a class template (as opposed to a -// function template), as we need to partially specialize it for -// reference types, which cannot be done with function templates. -template -class UniversalPrinter; - -template -void UniversalPrint(const T& value, ::std::ostream* os); - -// Used to print an STL-style container when the user doesn't define -// a PrintTo() for it. -template -void DefaultPrintTo(IsContainer /* dummy */, - false_type /* is not a pointer */, - const C& container, ::std::ostream* os) { - const size_t kMaxCount = 32; // The maximum number of elements to print. - *os << '{'; - size_t count = 0; - for (typename C::const_iterator it = container.begin(); - it != container.end(); ++it, ++count) { - if (count > 0) { - *os << ','; - if (count == kMaxCount) { // Enough has been printed. - *os << " ..."; - break; - } - } - *os << ' '; - // We cannot call PrintTo(*it, os) here as PrintTo() doesn't - // handle *it being a native array. - internal::UniversalPrint(*it, os); - } - - if (count > 0) { - *os << ' '; - } - *os << '}'; -} - -// Used to print a pointer that is neither a char pointer nor a member -// pointer, when the user doesn't define PrintTo() for it. (A member -// variable pointer or member function pointer doesn't really point to -// a location in the address space. Their representation is -// implementation-defined. Therefore they will be printed as raw -// bytes.) -template -void DefaultPrintTo(IsNotContainer /* dummy */, - true_type /* is a pointer */, - T* p, ::std::ostream* os) { - if (p == NULL) { - *os << "NULL"; - } else { - // C++ doesn't allow casting from a function pointer to any object - // pointer. - // - // IsTrue() silences warnings: "Condition is always true", - // "unreachable code". - if (IsTrue(ImplicitlyConvertible::value)) { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. However, we cannot cast it to const void* directly, - // even using reinterpret_cast, as earlier versions of gcc - // (e.g. 3.4.5) cannot compile the cast when p is a function - // pointer. Casting to UInt64 first solves the problem. - *os << reinterpret_cast( - reinterpret_cast(p)); - } - } -} - -// Used to print a non-container, non-pointer value when the user -// doesn't define PrintTo() for it. -template -void DefaultPrintTo(IsNotContainer /* dummy */, - false_type /* is not a pointer */, - const T& value, ::std::ostream* os) { - ::testing_internal::DefaultPrintNonContainerTo(value, os); -} - -// Prints the given value using the << operator if it has one; -// otherwise prints the bytes in it. This is what -// UniversalPrinter::Print() does when PrintTo() is not specialized -// or overloaded for type T. -// -// A user can override this behavior for a class type Foo by defining -// an overload of PrintTo() in the namespace where Foo is defined. We -// give the user this option as sometimes defining a << operator for -// Foo is not desirable (e.g. 
the coding style may prevent doing it, -// or there is already a << operator but it doesn't do what the user -// wants). -template -void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first two - // arguments determine which version will be picked. If T is an - // STL-style container, the version for container will be called; if - // T is a pointer, the pointer version will be called; otherwise the - // generic version will be called. - // - // Note that we check for container types here, prior to we check - // for protocol message types in our operator<<. The rationale is: - // - // For protocol messages, we want to give people a chance to - // override Google Mock's format by defining a PrintTo() or - // operator<<. For STL containers, other formats can be - // incompatible with Google Mock's format for the container - // elements; therefore we check for container types here to ensure - // that our format is used. - // - // The second argument of DefaultPrintTo() is needed to bypass a bug - // in Symbian's C++ compiler that prevents it from picking the right - // overload between: - // - // PrintTo(const T& x, ...); - // PrintTo(T* x, ...); - DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); -} - -// The following list of PrintTo() overloads tells -// UniversalPrinter::Print() how to print standard types (built-in -// types, strings, plain arrays, and pointers). - -// Overloads for various char types. -GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); -GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); -inline void PrintTo(char c, ::std::ostream* os) { - // When printing a plain char, we always treat it as unsigned. This - // way, the output won't be affected by whether the compiler thinks - // char is signed or not. - PrintTo(static_cast(c), os); -} - -// Overloads for other simple built-in types. -inline void PrintTo(bool x, ::std::ostream* os) { - *os << (x ? "true" : "false"); -} - -// Overload for wchar_t type. -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its decimal code (except for L'\0'). -// The L'\0' char is printed as "L'\\0'". The decimal code is printed -// as signed integer when wchar_t is implemented by the compiler -// as a signed type and is printed as an unsigned integer when wchar_t -// is implemented as an unsigned type. -GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); - -// Overloads for C strings. -GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); -inline void PrintTo(char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// signed/unsigned char is often used for representing binary data, so -// we print pointers to it as void* to be safe. -inline void PrintTo(const signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(const unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// MSVC can be configured to define wchar_t as a typedef of unsigned -// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native -// type. When wchar_t is a typedef, defining an overload for const -// wchar_t* would cause unsigned short* be printed as a wide string, -// possibly causing invalid memory accesses. 
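A hedged sketch of that customization point (the foo::Bar type and its fields are invented; ::testing::PrintToString() is the public helper that funnels through this printer):

#include <ostream>
#include "gtest/gtest.h"

namespace foo {

struct Bar {
  int x;
  int y;
};

// Picked up via argument-dependent lookup because it lives in Bar's namespace,
// so it takes precedence over the fallback byte dump.
void PrintTo(const Bar& bar, ::std::ostream* os) {
  *os << "Bar{" << bar.x << ", " << bar.y << "}";
}

}  // namespace foo

TEST(IllustrativePrinterTest, UsesCustomPrintTo) {
  foo::Bar bar = {1, 2};
  EXPECT_EQ("Bar{1, 2}", ::testing::PrintToString(bar));
}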
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Overloads for wide C strings -GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); -inline void PrintTo(wchar_t* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -#endif - -// Overload for C arrays. Multi-dimensional arrays are printed -// properly. - -// Prints the given number of elements in an array, without printing -// the curly braces. -template -void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { - UniversalPrint(a[0], os); - for (size_t i = 1; i != count; i++) { - *os << ", "; - UniversalPrint(a[i], os); - } -} - -// Overloads for ::string and ::std::string. -#if GTEST_HAS_GLOBAL_STRING -GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); -inline void PrintTo(const ::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); -inline void PrintTo(const ::std::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} - -// Overloads for ::wstring and ::std::wstring. -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_TR1_TUPLE -// Overload for ::std::tr1::tuple. Needed for printing function arguments, -// which are packed as tuples. - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os); - -// Overloaded PrintTo() for tuples of various arities. We support -// tuples of up-to 10 fields. The following implementation works -// regardless of whether tr1::tuple is implemented using the -// non-standard variadic template feature or not. - -inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo( - const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} -#endif // GTEST_HAS_TR1_TUPLE - -// Overload for std::pair. -template -void PrintTo(const ::std::pair& value, ::std::ostream* os) { - *os << '('; - // We cannot use UniversalPrint(value.first, os) here, as T1 may be - // a reference type. The same for printing value.second. 
- UniversalPrinter::Print(value.first, os); - *os << ", "; - UniversalPrinter::Print(value.second, os); - *os << ')'; -} - -// Implements printing a non-reference type T by letting the compiler -// pick the right overload of PrintTo() for T. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - // Note: we deliberately don't call this PrintTo(), as that name - // conflicts with ::testing::internal::PrintTo in the body of the - // function. - static void Print(const T& value, ::std::ostream* os) { - // By default, ::testing::internal::PrintTo() is used for printing - // the value. - // - // Thanks to Koenig look-up, if T is a class and has its own - // PrintTo() function defined in its namespace, that function will - // be visible here. Since it is more specific than the generic ones - // in ::testing::internal, it will be picked by the compiler in the - // following statement - exactly what we want. - PrintTo(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif // _MSC_VER -}; - -// UniversalPrintArray(begin, len, os) prints an array of 'len' -// elements, starting at address 'begin'. -template -void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { - if (len == 0) { - *os << "{}"; - } else { - *os << "{ "; - const size_t kThreshold = 18; - const size_t kChunkSize = 8; - // If the array has more than kThreshold elements, we'll have to - // omit some details by printing only the first and the last - // kChunkSize elements. - // TODO(wan@google.com): let the user control the threshold using a flag. - if (len <= kThreshold) { - PrintRawArrayTo(begin, len, os); - } else { - PrintRawArrayTo(begin, kChunkSize, os); - *os << ", ..., "; - PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); - } - *os << " }"; - } -} -// This overload prints a (const) char array compactly. -GTEST_API_ void UniversalPrintArray( - const char* begin, size_t len, ::std::ostream* os); - -// This overload prints a (const) wchar_t array compactly. -GTEST_API_ void UniversalPrintArray( - const wchar_t* begin, size_t len, ::std::ostream* os); - -// Implements printing an array type T[N]. -template -class UniversalPrinter { - public: - // Prints the given array, omitting some elements when there are too - // many. - static void Print(const T (&a)[N], ::std::ostream* os) { - UniversalPrintArray(a, N, os); - } -}; - -// Implements printing a reference type T&. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - static void Print(const T& value, ::std::ostream* os) { - // Prints the address of the value. We use reinterpret_cast here - // as static_cast doesn't compile when T is a function type. - *os << "@" << reinterpret_cast(&value) << " "; - - // Then prints the value itself. - UniversalPrint(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. 
-#endif // _MSC_VER -}; - -// Prints a value tersely: for a reference type, the referenced value -// (but not the address) is printed; for a (const) char pointer, the -// NUL-terminated string (but not the pointer) is printed. - -template -class UniversalTersePrinter { - public: - static void Print(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); - } -}; -template -class UniversalTersePrinter { - public: - static void Print(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); - } -}; -template -class UniversalTersePrinter { - public: - static void Print(const T (&value)[N], ::std::ostream* os) { - UniversalPrinter::Print(value, os); - } -}; -template <> -class UniversalTersePrinter { - public: - static void Print(const char* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(string(str), os); - } - } -}; -template <> -class UniversalTersePrinter { - public: - static void Print(char* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); - } -}; - -#if GTEST_HAS_STD_WSTRING -template <> -class UniversalTersePrinter { - public: - static void Print(const wchar_t* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(::std::wstring(str), os); - } - } -}; -#endif - -template <> -class UniversalTersePrinter { - public: - static void Print(wchar_t* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); - } -}; - -template -void UniversalTersePrint(const T& value, ::std::ostream* os) { - UniversalTersePrinter::Print(value, os); -} - -// Prints a value using the type inferred by the compiler. The -// difference between this and UniversalTersePrint() is that for a -// (const) char pointer, this prints both the pointer and the -// NUL-terminated string. -template -void UniversalPrint(const T& value, ::std::ostream* os) { - // A workarond for the bug in VC++ 7.1 that prevents us from instantiating - // UniversalPrinter with T directly. - typedef T T1; - UniversalPrinter::Print(value, os); -} - -#if GTEST_HAS_TR1_TUPLE -typedef ::std::vector Strings; - -// This helper template allows PrintTo() for tuples and -// UniversalTersePrintTupleFieldsToStrings() to be defined by -// induction on the number of tuple fields. The idea is that -// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N -// fields in tuple t, and can be defined in terms of -// TuplePrefixPrinter. - -// The inductive case. -template -struct TuplePrefixPrinter { - // Prints the first N fields of a tuple. - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - TuplePrefixPrinter::PrintPrefixTo(t, os); - *os << ", "; - UniversalPrinter::type> - ::Print(::std::tr1::get(t), os); - } - - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Base cases. 
-template <> -struct TuplePrefixPrinter<0> { - template - static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} - - template - static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} -}; -// We have to specialize the entire TuplePrefixPrinter<> class -// template here, even though the definition of -// TersePrintPrefixToStrings() is the same as the generic version, as -// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't -// support specializing a method template of a class template. -template <> -struct TuplePrefixPrinter<1> { - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - UniversalPrinter::type>:: - Print(::std::tr1::get<0>(t), os); - } - - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get<0>(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os) { - *os << "("; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - PrintPrefixTo(t, os); - *os << ")"; -} - -// Prints the fields of a tuple tersely to a string vector, one -// element for each field. See the comment before -// UniversalTersePrint() for how we define "tersely". -template -Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { - Strings result; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - TersePrintPrefixToStrings(value, &result); - return result; -} -#endif // GTEST_HAS_TR1_TUPLE - -} // namespace internal - -template -::std::string PrintToString(const T& value) { - ::std::stringstream ss; - internal::UniversalTersePrinter::Print(value, &ss); - return ss.str(); -} - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { -namespace internal { - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Outputs a message explaining invalid registration of different -// fixture class for the same test case. This may happen when -// TEST_P macro is used to define two tests with the same name -// but in different namespaces. -GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line); - -template class ParamGeneratorInterface; -template class ParamGenerator; - -// Interface for iterating over elements provided by an implementation -// of ParamGeneratorInterface. -template -class ParamIteratorInterface { - public: - virtual ~ParamIteratorInterface() {} - // A pointer to the base generator instance. - // Used only for the purposes of iterator comparison - // to make sure that two iterators belong to the same generator. - virtual const ParamGeneratorInterface* BaseGenerator() const = 0; - // Advances iterator to point to the next element - // provided by the generator. The caller is responsible - // for not calling Advance() on an iterator equal to - // BaseGenerator()->End(). - virtual void Advance() = 0; - // Clones the iterator object. Used for implementing copy semantics - // of ParamIterator. - virtual ParamIteratorInterface* Clone() const = 0; - // Dereferences the current iterator and provides (read-only) access - // to the pointed value. It is the caller's responsibility not to call - // Current() on an iterator equal to BaseGenerator()->End(). - // Used for implementing ParamGenerator::operator*(). 
- virtual const T* Current() const = 0; - // Determines whether the given iterator and other point to the same - // element in the sequence generated by the generator. - // Used for implementing ParamGenerator::operator==(). - virtual bool Equals(const ParamIteratorInterface& other) const = 0; -}; - -// Class iterating over elements provided by an implementation of -// ParamGeneratorInterface. It wraps ParamIteratorInterface -// and implements the const forward iterator concept. -template -class ParamIterator { - public: - typedef T value_type; - typedef const T& reference; - typedef ptrdiff_t difference_type; - - // ParamIterator assumes ownership of the impl_ pointer. - ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} - ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); - return *this; - } - - const T& operator*() const { return *impl_->Current(); } - const T* operator->() const { return impl_->Current(); } - // Prefix version of operator++. - ParamIterator& operator++() { - impl_->Advance(); - return *this; - } - // Postfix version of operator++. - ParamIterator operator++(int /*unused*/) { - ParamIteratorInterface* clone = impl_->Clone(); - impl_->Advance(); - return ParamIterator(clone); - } - bool operator==(const ParamIterator& other) const { - return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); - } - bool operator!=(const ParamIterator& other) const { - return !(*this == other); - } - - private: - friend class ParamGenerator; - explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - scoped_ptr > impl_; -}; - -// ParamGeneratorInterface is the binary interface to access generators -// defined in other translation units. -template -class ParamGeneratorInterface { - public: - typedef T ParamType; - - virtual ~ParamGeneratorInterface() {} - - // Generator interface definition - virtual ParamIteratorInterface* Begin() const = 0; - virtual ParamIteratorInterface* End() const = 0; -}; - -// Wraps ParamGeneratorInterface and provides general generator syntax -// compatible with the STL Container concept. -// This class implements copy initialization semantics and the contained -// ParamGeneratorInterface instance is shared among all copies -// of the original object. This is possible because that instance is immutable. -template -class ParamGenerator { - public: - typedef ParamIterator iterator; - - explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} - ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} - - ParamGenerator& operator=(const ParamGenerator& other) { - impl_ = other.impl_; - return *this; - } - - iterator begin() const { return iterator(impl_->Begin()); } - iterator end() const { return iterator(impl_->End()); } - - private: - linked_ptr > impl_; -}; - -// Generates values from a range of two comparable values. Can be used to -// generate sequences of user-defined types that implement operator+() and -// operator<(). -// This class is used in the Range() function. 
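As a point of reference for the Range() generator implemented below, ParamGenerator behaves like a read-only STL container, so the produced sequence can be walked with begin()/end(); this sketch uses gtest's public Range() and touches the internal ParamGenerator type only for illustration:

#include <iostream>
#include "gtest/gtest.h"

void DumpRange() {
  // Range(begin, end, step): 'end' is excluded, matching the 'i < end' loop
  // in CalculateEndIndex() below.
  ::testing::internal::ParamGenerator<int> gen = ::testing::Range(0, 10, 3);
  for (::testing::internal::ParamGenerator<int>::iterator it = gen.begin();
       it != gen.end(); ++it) {
    std::cout << *it << " ";  // prints: 0 3 6 9
  }
}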
-template -class RangeGenerator : public ParamGeneratorInterface { - public: - RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} - virtual ~RangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, begin_, 0, step_); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, end_, end_index_, step_); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, T value, int index, - IncrementT step) - : base_(base), value_(value), index_(index), step_(step) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - value_ = value_ + step_; - index_++; - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const T* Current() const { return &value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const int other_index = - CheckedDowncastToActualType(&other)->index_; - return index_ == other_index; - } - - private: - Iterator(const Iterator& other) - : ParamIteratorInterface(), - base_(other.base_), value_(other.value_), index_(other.index_), - step_(other.step_) {} - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - T value_; - int index_; - const IncrementT step_; - }; // class RangeGenerator::Iterator - - static int CalculateEndIndex(const T& begin, - const T& end, - const IncrementT& step) { - int end_index = 0; - for (T i = begin; i < end; i = i + step) - end_index++; - return end_index; - } - - // No implementation - assignment is unsupported. - void operator=(const RangeGenerator& other); - - const T begin_; - const T end_; - const IncrementT step_; - // The index for the end() iterator. All the elements in the generated - // sequence are indexed (0-based) to aid iterator comparison. - const int end_index_; -}; // class RangeGenerator - - -// Generates values from a pair of STL-style iterators. Used in the -// ValuesIn() function. The elements are copied from the source range -// since the source can be located on the stack, and the generator -// is likely to persist beyond that stack frame. 
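The copy-on-construction behaviour noted above means a generator built by ValuesIn() may safely outlive the range it was created from; a minimal sketch, with a hypothetical helper name:

#include <vector>
#include "gtest/gtest.h"

::testing::internal::ParamGenerator<int> MakeQpGenerator() {
  std::vector<int> local;
  local.push_back(2);
  local.push_back(4);
  local.push_back(8);
  // The elements are copied into the generator here, so 'local' can be
  // destroyed when this function returns.
  return ::testing::ValuesIn(local);
}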
-template -class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { - public: - template - ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) - : container_(begin, end) {} - virtual ~ValuesInIteratorRangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, container_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, container_.end()); - } - - private: - typedef typename ::std::vector ContainerType; - - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - typename ContainerType::const_iterator iterator) - : base_(base), iterator_(iterator) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - ++iterator_; - value_.reset(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - // We need to use cached value referenced by iterator_ because *iterator_ - // can return a temporary object (and of type other then T), so just - // having "return &*iterator_;" doesn't work. - // value_ is updated here and not in Advance() because Advance() - // can advance iterator_ beyond the end of the range, and we cannot - // detect that fact. The client code, on the other hand, is - // responsible for not calling Current() on an out-of-range iterator. - virtual const T* Current() const { - if (value_.get() == NULL) - value_.reset(new T(*iterator_)); - return value_.get(); - } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - return iterator_ == - CheckedDowncastToActualType(&other)->iterator_; - } - - private: - Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. - : ParamIteratorInterface(), - base_(other.base_), - iterator_(other.iterator_) {} - - const ParamGeneratorInterface* const base_; - typename ContainerType::const_iterator iterator_; - // A cached value of *iterator_. We keep it here to allow access by - // pointer in the wrapping iterator's operator->(). - // value_ needs to be mutable to be accessed in Current(). - // Use of scoped_ptr helps manage cached value's lifetime, - // which is bound by the lifespan of the iterator itself. - mutable scoped_ptr value_; - }; // class ValuesInIteratorRangeGenerator::Iterator - - // No implementation - assignment is unsupported. - void operator=(const ValuesInIteratorRangeGenerator& other); - - const ContainerType container_; -}; // class ValuesInIteratorRangeGenerator - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Stores a parameter value and later creates tests parameterized with that -// value. 
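From the user's side, the factory machinery below is reached only through TEST_P() and GetParam(); a minimal sketch with a hypothetical fixture name:

#include "gtest/gtest.h"

class QpTest : public ::testing::TestWithParam<int> {};

TEST_P(QpTest, AcceptsValue) {
  const int qp = GetParam();  // the parameter value stored for this test instance
  EXPECT_GE(qp, 0);
}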
-template -class ParameterizedTestFactory : public TestFactoryBase { - public: - typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} - virtual Test* CreateTest() { - TestClass::SetParam(¶meter_); - return new TestClass(); - } - - private: - const ParamType parameter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactoryBase is a base class for meta-factories that create -// test factories for passing into MakeAndRegisterTestInfo function. -template -class TestMetaFactoryBase { - public: - virtual ~TestMetaFactoryBase() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactory creates test factories for passing into -// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives -// ownership of test factory pointer, same factory object cannot be passed -// into that method twice. But ParameterizedTestCaseInfo is going to call -// it for each Test/Parameter value combination. Thus it needs meta factory -// creator class. -template -class TestMetaFactory - : public TestMetaFactoryBase { - public: - typedef typename TestCase::ParamType ParamType; - - TestMetaFactory() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { - return new ParameterizedTestFactory(parameter); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfoBase is a generic interface -// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase -// accumulates test information provided by TEST_P macro invocations -// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations -// and uses that information to register all resulting test instances -// in RegisterTests method. The ParameterizeTestCaseRegistry class holds -// a collection of pointers to the ParameterizedTestCaseInfo objects -// and calls RegisterTests() on each of them when asked. -class ParameterizedTestCaseInfoBase { - public: - virtual ~ParameterizedTestCaseInfoBase() {} - - // Base part of test case name for display purposes. - virtual const string& GetTestCaseName() const = 0; - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const = 0; - // UnitTest class invokes this method to register tests in this - // test case right before running them in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - virtual void RegisterTests() = 0; - - protected: - ParameterizedTestCaseInfoBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P -// macro invocations for a particular test case and generators -// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that -// test case. It registers tests with all values generated by all -// generators when asked. -template -class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { - public: - // ParamType and GeneratorCreationFunc are private types but are required - // for declarations of public methods AddTestPattern() and - // AddTestCaseInstantiation(). 
- typedef typename TestCase::ParamType ParamType; - // A function that returns an instance of appropriate generator type. - typedef ParamGenerator(GeneratorCreationFunc)(); - - explicit ParameterizedTestCaseInfo(const char* name) - : test_case_name_(name) {} - - // Test case base name for display purposes. - virtual const string& GetTestCaseName() const { return test_case_name_; } - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } - // TEST_P macro uses AddTestPattern() to record information - // about a single test in a LocalTestInfo structure. - // test_case_name is the base name of the test case (without invocation - // prefix). test_base_name is the name of an individual test without - // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is - // test case base name and DoBar is test base name. - void AddTestPattern(const char* test_case_name, - const char* test_base_name, - TestMetaFactoryBase* meta_factory) { - tests_.push_back(linked_ptr(new TestInfo(test_case_name, - test_base_name, - meta_factory))); - } - // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information - // about a generator. - int AddTestCaseInstantiation(const string& instantiation_name, - GeneratorCreationFunc* func, - const char* /* file */, - int /* line */) { - instantiations_.push_back(::std::make_pair(instantiation_name, func)); - return 0; // Return value used only to run this method in namespace scope. - } - // UnitTest class invokes this method to register tests in this test case - // test cases right before running tests in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { - for (typename TestInfoContainer::iterator test_it = tests_.begin(); - test_it != tests_.end(); ++test_it) { - linked_ptr test_info = *test_it; - for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { - const string& instantiation_name = gen_it->first; - ParamGenerator generator((*gen_it->second)()); - - string test_case_name; - if ( !instantiation_name.empty() ) - test_case_name = instantiation_name + "/"; - test_case_name += test_info->test_case_base_name; - - int i = 0; - for (typename ParamGenerator::iterator param_it = - generator.begin(); - param_it != generator.end(); ++param_it, ++i) { - Message test_name_stream; - test_name_stream << test_info->test_base_name << "/" << i; - MakeAndRegisterTestInfo( - test_case_name.c_str(), - test_name_stream.GetString().c_str(), - NULL, // No type parameter. - PrintToString(*param_it).c_str(), - GetTestCaseTypeId(), - TestCase::SetUpTestCase, - TestCase::TearDownTestCase, - test_info->test_meta_factory->CreateTestFactory(*param_it)); - } // for param_it - } // for gen_it - } // for test_it - } // RegisterTests - - private: - // LocalTestInfo structure keeps information about a single test registered - // with TEST_P macro. 
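Continuing the hypothetical QpTest fixture above, the registration loop just shown expands a single instantiation into one test per generated value, named InstantiationName/FixtureName.TestName/index:

#include "gtest/gtest.h"

// Registers HighBitDepth/QpTest.AcceptsValue/0, .../1 and .../2 with the
// parameter values 0, 32 and 63 respectively.
INSTANTIATE_TEST_CASE_P(HighBitDepth, QpTest, ::testing::Values(0, 32, 63));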
- struct TestInfo { - TestInfo(const char* a_test_case_base_name, - const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) : - test_case_base_name(a_test_case_base_name), - test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} - - const string test_case_base_name; - const string test_base_name; - const scoped_ptr > test_meta_factory; - }; - typedef ::std::vector > TestInfoContainer; - // Keeps pairs of - // received from INSTANTIATE_TEST_CASE_P macros. - typedef ::std::vector > - InstantiationContainer; - - const string test_case_name_; - TestInfoContainer tests_; - InstantiationContainer instantiations_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); -}; // class ParameterizedTestCaseInfo - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase -// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P -// macros use it to locate their corresponding ParameterizedTestCaseInfo -// descriptors. -class ParameterizedTestCaseRegistry { - public: - ParameterizedTestCaseRegistry() {} - ~ParameterizedTestCaseRegistry() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - delete *it; - } - } - - // Looks up or creates and returns a structure containing information about - // tests and instantiations of a particular test case. - template - ParameterizedTestCaseInfo* GetTestCasePatternHolder( - const char* test_case_name, - const char* file, - int line) { - ParameterizedTestCaseInfo* typed_test_info = NULL; - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - if ((*it)->GetTestCaseName() == test_case_name) { - if ((*it)->GetTestCaseTypeId() != GetTypeId()) { - // Complain about incorrect usage of Google Test facilities - // and terminate the program since we cannot guaranty correct - // test case setup and tear-down in this case. - ReportInvalidTestCaseType(test_case_name, file, line); - posix::Abort(); - } else { - // At this point we are sure that the object we found is of the same - // type we are looking for, so we downcast it to that type - // without further checks. - typed_test_info = CheckedDowncastToActualType< - ParameterizedTestCaseInfo >(*it); - } - break; - } - } - if (typed_test_info == NULL) { - typed_test_info = new ParameterizedTestCaseInfo(test_case_name); - test_case_infos_.push_back(typed_test_info); - } - return typed_test_info; - } - void RegisterTests() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - (*it)->RegisterTests(); - } - } - - private: - typedef ::std::vector TestCaseInfoContainer; - - TestCaseInfoContainer test_case_infos_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); -}; - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -// This file was GENERATED by command: -// pump.py gtest-param-util-generated.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently Google Test supports at most 50 arguments in Values, -// and at most 10 arguments in Combine. Please contact -// googletestframework@googlegroups.com if you need more. -// Please note that the number of arguments to Combine is limited -// by the maximum arity of the implementation of tr1::tuple which is -// currently set at 10. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Forward declarations of ValuesIn(), which is implemented in -// include/gtest/gtest-param-test.h. -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end); - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]); - -template -internal::ParamGenerator ValuesIn( - const Container& container); - -namespace internal { - -// Used in the Values() function to provide polymorphic capabilities. -template -class ValueArray1 { - public: - explicit ValueArray1(T1 v1) : v1_(v1) {} - - template - operator ParamGenerator() const { return ValuesIn(&v1_, &v1_ + 1); } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray1& other); - - const T1 v1_; -}; - -template -class ValueArray2 { - public: - ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray2& other); - - const T1 v1_; - const T2 v2_; -}; - -template -class ValueArray3 { - public: - ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray3& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; -}; - -template -class ValueArray4 { - public: - ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray4& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; -}; - -template -class ValueArray5 { - public: - ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray5& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; -}; - -template -class ValueArray6 { - public: - ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray6& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; -}; - -template -class ValueArray7 { - public: - ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray7& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; -}; - -template -class ValueArray8 { - public: - ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray8& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; -}; - -template -class ValueArray9 { - public: - ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray9& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; -}; - -template -class ValueArray10 { - public: - ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray10& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; -}; - -template -class ValueArray11 { - public: - ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray11& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; -}; - -template -class ValueArray12 { - public: - ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray12& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; -}; - -template -class ValueArray13 { - public: - ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray13& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; -}; - -template -class ValueArray14 { - public: - ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray14& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; -}; - -template -class ValueArray15 { - public: - ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray15& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; -}; - -template -class ValueArray16 { - public: - ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray16& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; -}; - -template -class ValueArray17 { - public: - ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray17& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; -}; - -template -class ValueArray18 { - public: - ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray18& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; -}; - -template -class ValueArray19 { - public: - ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray19& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; -}; - -template -class ValueArray20 { - public: - ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray20& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; -}; - -template -class ValueArray21 { - public: - ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray21& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; -}; - -template -class ValueArray22 { - public: - ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray22& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; -}; - -template -class ValueArray23 { - public: - ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray23& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; -}; - -template -class ValueArray24 { - public: - ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray24& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; -}; - -template -class ValueArray25 { - public: - ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray25& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; -}; - -template -class ValueArray26 { - public: - ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray26& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; -}; - -template -class ValueArray27 { - public: - ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray27& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; -}; - -template -class ValueArray28 { - public: - ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray28& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; -}; - -template -class ValueArray29 { - public: - ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray29& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; -}; - -template -class ValueArray30 { - public: - ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray30& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; -}; - -template -class ValueArray31 { - public: - ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray31& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; -}; - -template -class ValueArray32 { - public: - ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray32& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; -}; - -template -class ValueArray33 { - public: - ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray33& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; -}; - -template -class ValueArray34 { - public: - ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray34& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; -}; - -template -class ValueArray35 { - public: - ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray35& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; -}; - -template -class ValueArray36 { - public: - ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray36& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; -}; - -template -class ValueArray37 { - public: - ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray37& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; -}; - -template -class ValueArray38 { - public: - ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray38& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; -}; - -template -class ValueArray39 { - public: - ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray39& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; -}; - -template -class ValueArray40 { - public: - ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray40& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; -}; - -template -class ValueArray41 { - public: - ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray41& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; -}; - -template -class ValueArray42 { - public: - ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray42& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; -}; - -template -class ValueArray43 { - public: - ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), - v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray43& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; -}; - -template -class ValueArray44 { - public: - ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), - v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), - v43_(v43), v44_(v44) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray44& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; -}; - -template -class ValueArray45 { - public: - ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), - v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray45& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; -}; - -template -class ValueArray46 { - public: - ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray46& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; -}; - -template -class ValueArray47 { - public: - ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), - v47_(v47) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray47& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; -}; - -template -class ValueArray48 { - public: - ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), - v46_(v46), v47_(v47), v48_(v48) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray48& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; -}; - -template -class ValueArray49 { - public: - ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, - T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray49& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; -}; - -template -class ValueArray50 { - public: - ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, - T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_), static_cast(v50_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
-  void operator=(const ValueArray50& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-  const T49 v49_;
-  const T50 v50_;
-};
-
-# if GTEST_HAS_COMBINE
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Generates values from the Cartesian product of values produced
-// by the argument generators.
-//
-template <typename T1, typename T2>
-class CartesianProductGenerator2
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2> ParamType;
-
-  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2)
-      : g1_(g1), g2_(g2) {}
-  virtual ~CartesianProductGenerator2() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2) {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current2_;
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - ParamType current_value_; - }; // class CartesianProductGenerator2::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator2& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; -}; // class CartesianProductGenerator2 - - -template -class CartesianProductGenerator3 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator3(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - virtual ~CartesianProductGenerator3() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current3_; - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - ParamType current_value_; - }; // class CartesianProductGenerator3::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator3& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; -}; // class CartesianProductGenerator3 - - -template -class CartesianProductGenerator4 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator4(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - virtual ~CartesianProductGenerator4() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current4_; - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - ParamType current_value_; - }; // class CartesianProductGenerator4::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator4& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; -}; // class CartesianProductGenerator4 - - -template -class CartesianProductGenerator5 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator5(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - virtual ~CartesianProductGenerator5() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current5_; - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - ParamType current_value_; - }; // class CartesianProductGenerator5::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator5& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; -}; // class CartesianProductGenerator5 - - -template -class CartesianProductGenerator6 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator6(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - virtual ~CartesianProductGenerator6() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current6_; - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - ParamType current_value_; - }; // class CartesianProductGenerator6::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator6& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; -}; // class CartesianProductGenerator6 - - -template -class CartesianProductGenerator7 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator7(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - virtual ~CartesianProductGenerator7() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current7_; - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. 
- GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - ParamType current_value_; - }; // class CartesianProductGenerator7::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator7& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; -}; // class CartesianProductGenerator7 - - -template -class CartesianProductGenerator8 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator8(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - virtual ~CartesianProductGenerator8() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current8_; - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. 
- const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - ParamType current_value_; - }; // class CartesianProductGenerator8::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator8& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; -}; // class CartesianProductGenerator8 - - -template -class CartesianProductGenerator9 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator9(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - virtual ~CartesianProductGenerator9() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9) - : base_(base), - 
begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current9_; - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - ParamType current_value_; - }; // class CartesianProductGenerator9::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator9& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; -}; // class CartesianProductGenerator9 - - -template -class CartesianProductGenerator10 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator10(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9, - const ParamGenerator& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - virtual ~CartesianProductGenerator10() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end(), g10_, g10_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9, - const ParamGenerator& g10, - const typename ParamGenerator::iterator& current10) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9), - begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current10_; - if (current10_ == end10_) { - current10_ = begin10_; - ++current9_; - } - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_ && - current10_ == typed_other->current10_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_), - begin10_(other.begin10_), - end10_(other.end10_), - current10_(other.current10_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_ || - current10_ == end10_; - } - - // No implementation - assignment is unsupported. 
- void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - const typename ParamGenerator::iterator begin10_; - const typename ParamGenerator::iterator end10_; - typename ParamGenerator::iterator current10_; - ParamType current_value_; - }; // class CartesianProductGenerator10::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator10& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; - const ParamGenerator g10_; -}; // class CartesianProductGenerator10 - - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Helper classes providing Combine() with polymorphic features. They allow -// casting CartesianProductGeneratorN to ParamGenerator if T is -// convertible to U. -// -template -class CartesianProductHolder2 { - public: -CartesianProductHolder2(const Generator1& g1, const Generator2& g2) - : g1_(g1), g2_(g2) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator2( - static_cast >(g1_), - static_cast >(g2_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder2& other); - - const Generator1 g1_; - const Generator2 g2_; -}; // class CartesianProductHolder2 - -template -class CartesianProductHolder3 { - public: -CartesianProductHolder3(const Generator1& g1, const Generator2& g2, - const Generator3& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator3( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder3& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; -}; // class CartesianProductHolder3 - -template -class CartesianProductHolder4 { - public: -CartesianProductHolder4(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator4( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder4& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; -}; // class CartesianProductHolder4 - -template -class CartesianProductHolder5 { - public: -CartesianProductHolder5(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator5( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder5& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; -}; // class CartesianProductHolder5 - -template -class CartesianProductHolder6 { - public: -CartesianProductHolder6(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator6( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder6& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; -}; // class CartesianProductHolder6 - -template -class CartesianProductHolder7 { - public: -CartesianProductHolder7(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator7( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder7& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; -}; // class CartesianProductHolder7 - -template -class CartesianProductHolder8 { - public: -CartesianProductHolder8(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator8( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder8& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; -}; // class CartesianProductHolder8 - -template -class CartesianProductHolder9 { - public: -CartesianProductHolder9(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator9( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder9& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; -}; // class CartesianProductHolder9 - -template -class CartesianProductHolder10 { - public: -CartesianProductHolder10(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9, const Generator10& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator10( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_), - static_cast >(g10_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder10& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; - const Generator10 g10_; -}; // class CartesianProductHolder10 - -# endif // GTEST_HAS_COMBINE - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Functions producing parameter generators. -// -// Google Test uses these generators to produce parameters for value- -// parameterized tests. When a parameterized test case is instantiated -// with a particular generator, Google Test creates and runs tests -// for each element in the sequence produced by the generator. -// -// In the following sample, tests from test case FooTest are instantiated -// each three times with parameter values 3, 5, and 8: -// -// class FooTest : public TestWithParam { ... }; -// -// TEST_P(FooTest, TestThis) { -// } -// TEST_P(FooTest, TestThat) { -// } -// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); -// - -// Range() returns generators providing sequences of values in a range. -// -// Synopsis: -// Range(start, end) -// - returns a generator producing a sequence of values {start, start+1, -// start+2, ..., }. -// Range(start, end, step) -// - returns a generator producing a sequence of values {start, start+step, -// start+step+step, ..., }. -// Notes: -// * The generated sequences never include end. For example, Range(1, 5) -// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) -// returns a generator producing {1, 3, 5, 7}. -// * start and end must have the same type. That type may be any integral or -// floating-point type or a user defined type satisfying these conditions: -// * It must be assignable (have operator=() defined). -// * It must have operator+() (operator+(int-compatible type) for -// two-operand version). -// * It must have operator<() defined. -// Elements in the resulting sequences will also have that type. -// * Condition start < end must be satisfied in order for resulting sequences -// to contain any elements. -// -template -internal::ParamGenerator Range(T start, T end, IncrementT step) { - return internal::ParamGenerator( - new internal::RangeGenerator(start, end, step)); -} - -template -internal::ParamGenerator Range(T start, T end) { - return Range(start, end, 1); -} - -// ValuesIn() function allows generation of tests with parameters coming from -// a container. -// -// Synopsis: -// ValuesIn(const T (&array)[N]) -// - returns a generator producing sequences with elements from -// a C-style array. -// ValuesIn(const Container& container) -// - returns a generator producing sequences with elements from -// an STL-style container. -// ValuesIn(Iterator begin, Iterator end) -// - returns a generator producing sequences with elements from -// a range [begin, end) defined by a pair of STL-style iterators. These -// iterators can also be plain C pointers. -// -// Please note that ValuesIn copies the values from the containers -// passed in and keeps them to generate tests in RUN_ALL_TESTS(). 
-// -// Examples: -// -// This instantiates tests from test case StringTest -// each with C-string values of "foo", "bar", and "baz": -// -// const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); -// -// This instantiates tests from test case StlStringTest -// each with STL strings with values "a" and "b": -// -// ::std::vector< ::std::string> GetParameterStrings() { -// ::std::vector< ::std::string> v; -// v.push_back("a"); -// v.push_back("b"); -// return v; -// } -// -// INSTANTIATE_TEST_CASE_P(CharSequence, -// StlStringTest, -// ValuesIn(GetParameterStrings())); -// -// -// This will also instantiate tests from CharTest -// each with parameter values 'a' and 'b': -// -// ::std::list GetParameterChars() { -// ::std::list list; -// list.push_back('a'); -// list.push_back('b'); -// return list; -// } -// ::std::list l = GetParameterChars(); -// INSTANTIATE_TEST_CASE_P(CharSequence2, -// CharTest, -// ValuesIn(l.begin(), l.end())); -// -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end) { - typedef typename ::testing::internal::IteratorTraits - ::value_type ParamType; - return internal::ParamGenerator( - new internal::ValuesInIteratorRangeGenerator(begin, end)); -} - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]) { - return ValuesIn(array, array + N); -} - -template -internal::ParamGenerator ValuesIn( - const Container& container) { - return ValuesIn(container.begin(), container.end()); -} - -// Values() allows generating tests from explicitly specified list of -// parameters. -// -// Synopsis: -// Values(T v1, T v2, ..., T vN) -// - returns a generator producing sequences with elements v1, v2, ..., vN. -// -// For example, this instantiates tests from test case BarTest each -// with values "one", "two", and "three": -// -// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); -// -// This instantiates tests from test case BazTest each with values 1, 2, 3.5. -// The exact type of values will depend on the type of parameter in BazTest. -// -// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); -// -// Currently, Values() supports from 1 to 50 parameters. 
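// A minimal usage sketch (not taken from the gtest sources above; the
// PrimeTest fixture name and the parameter values are illustrative only)
// showing how Range(), ValuesIn(), and Values() feed a value-parameterized
// test case:
//
//   #include "gtest/gtest.h"
//
//   class PrimeTest : public testing::TestWithParam<int> {};
//
//   TEST_P(PrimeTest, IsPositive) { EXPECT_GT(GetParam(), 0); }
//
//   // Range(2, 10, 2) yields 2, 4, 6, 8 (the end value is never included).
//   INSTANTIATE_TEST_CASE_P(Evens, PrimeTest, testing::Range(2, 10, 2));
//
//   // ValuesIn() copies its parameters from an array or STL-style container.
//   const int kSmallPrimes[] = {2, 3, 5, 7, 11};
//   INSTANTIATE_TEST_CASE_P(FromArray, PrimeTest,
//                           testing::ValuesIn(kSmallPrimes));
//
//   // Values() takes an explicit list of 1 to 50 parameters.
//   INSTANTIATE_TEST_CASE_P(Explicit, PrimeTest, testing::Values(13, 17, 19));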
-// -template -internal::ValueArray1 Values(T1 v1) { - return internal::ValueArray1(v1); -} - -template -internal::ValueArray2 Values(T1 v1, T2 v2) { - return internal::ValueArray2(v1, v2); -} - -template -internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { - return internal::ValueArray3(v1, v2, v3); -} - -template -internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { - return internal::ValueArray4(v1, v2, v3, v4); -} - -template -internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5) { - return internal::ValueArray5(v1, v2, v3, v4, v5); -} - -template -internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6) { - return internal::ValueArray6(v1, v2, v3, v4, v5, v6); -} - -template -internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7) { - return internal::ValueArray7(v1, v2, v3, v4, v5, - v6, v7); -} - -template -internal::ValueArray8 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { - return internal::ValueArray8(v1, v2, v3, v4, - v5, v6, v7, v8); -} - -template -internal::ValueArray9 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { - return internal::ValueArray9(v1, v2, v3, - v4, v5, v6, v7, v8, v9); -} - -template -internal::ValueArray10 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { - return internal::ValueArray10(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10); -} - -template -internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) { - return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); -} - -template -internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) { - return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); -} - -template -internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) { - return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); -} - -template -internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { - return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14); -} - -template -internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { - return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); -} - -template -internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16) { - return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16); -} - -template -internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17) { - return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17); -} - -template -internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18) { - return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18); -} - -template -internal::ValueArray19 
Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { - return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); -} - -template -internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { - return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); -} - -template -internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { - return internal::ValueArray21(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); -} - -template -internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22) { - return internal::ValueArray22(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22); -} - -template -internal::ValueArray23 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23) { - return internal::ValueArray23(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23); -} - -template -internal::ValueArray24 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24) { - return internal::ValueArray24(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24); -} - -template -internal::ValueArray25 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { - return internal::ValueArray25(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25); -} - -template -internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) { - return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); -} - -template -internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) { - return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); -} - -template -internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) { - return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28); -} - -template -internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) { - return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29); -} - -template -internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { - return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30); -} - -template -internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { - return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31); -} - -template -internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32) { - return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32); -} - -template -internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33) { - return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); -} - -template -internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34) { - return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); -} - -template -internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { - return internal::ValueArray35(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); -} - -template -internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { - return internal::ValueArray36(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36); -} - -template -internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37) { - return internal::ValueArray37(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37); -} - -template -internal::ValueArray38 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38) { - return internal::ValueArray38(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, - v33, v34, v35, v36, v37, v38); -} - -template -internal::ValueArray39 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38, T39 v39) { - return internal::ValueArray39(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39); -} - -template -internal::ValueArray40 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, - T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, - T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { - return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, - v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); -} - -template -internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - 
T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { - return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, - v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); -} - -template -internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) { - return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, - v42); -} - -template -internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) { - return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, - v41, v42, v43); -} - -template -internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) { - return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, - v40, v41, v42, v43, v44); -} - -template -internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { - return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, - v39, v40, v41, v42, v43, v44, v45); -} - -template -internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 
v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { - return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46); -} - -template -internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { - return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); -} - -template -internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, - T48 v48) { - return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, - v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); -} - -template -internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, - T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, - T47 v47, T48 v48, T49 v49) { - return internal::ValueArray49(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, - v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); -} - -template -internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, - T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, - T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { - return internal::ValueArray50(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50); -} - -// Bool() allows generating tests with parameters in a set of (false, true). -// -// Synopsis: -// Bool() -// - returns a generator producing sequences with elements {false, true}. 
-// -// It is useful when testing code that depends on Boolean flags. Combinations -// of multiple flags can be tested when several Bool()'s are combined using -// Combine() function. -// -// In the following example all tests in the test case FlagDependentTest -// will be instantiated twice with parameters false and true. -// -// class FlagDependentTest : public testing::TestWithParam { -// virtual void SetUp() { -// external_flag = GetParam(); -// } -// } -// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); -// -inline internal::ParamGenerator Bool() { - return Values(false, true); -} - -# if GTEST_HAS_COMBINE -// Combine() allows the user to combine two or more sequences to produce -// values of a Cartesian product of those sequences' elements. -// -// Synopsis: -// Combine(gen1, gen2, ..., genN) -// - returns a generator producing sequences with elements coming from -// the Cartesian product of elements from the sequences generated by -// gen1, gen2, ..., genN. The sequence elements will have a type of -// tuple where T1, T2, ..., TN are the types -// of elements from sequences produces by gen1, gen2, ..., genN. -// -// Combine can have up to 10 arguments. This number is currently limited -// by the maximum number of elements in the tuple implementation used by Google -// Test. -// -// Example: -// -// This will instantiate tests in test case AnimalTest each one with -// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), -// tuple("dog", BLACK), and tuple("dog", WHITE): -// -// enum Color { BLACK, GRAY, WHITE }; -// class AnimalTest -// : public testing::TestWithParam > {...}; -// -// TEST_P(AnimalTest, AnimalLooksNice) {...} -// -// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, -// Combine(Values("cat", "dog"), -// Values(BLACK, WHITE))); -// -// This will instantiate tests in FlagDependentTest with all variations of two -// Boolean flags: -// -// class FlagDependentTest -// : public testing::TestWithParam > { -// virtual void SetUp() { -// // Assigns external_flag_1 and external_flag_2 values from the tuple. -// tie(external_flag_1, external_flag_2) = GetParam(); -// } -// }; -// -// TEST_P(FlagDependentTest, TestFeature1) { -// // Test your code using external_flag_1 and external_flag_2 here. 
-// } -// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, -// Combine(Bool(), Bool())); -// -template -internal::CartesianProductHolder2 Combine( - const Generator1& g1, const Generator2& g2) { - return internal::CartesianProductHolder2( - g1, g2); -} - -template -internal::CartesianProductHolder3 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3) { - return internal::CartesianProductHolder3( - g1, g2, g3); -} - -template -internal::CartesianProductHolder4 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4) { - return internal::CartesianProductHolder4( - g1, g2, g3, g4); -} - -template -internal::CartesianProductHolder5 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5) { - return internal::CartesianProductHolder5( - g1, g2, g3, g4, g5); -} - -template -internal::CartesianProductHolder6 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6) { - return internal::CartesianProductHolder6( - g1, g2, g3, g4, g5, g6); -} - -template -internal::CartesianProductHolder7 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7) { - return internal::CartesianProductHolder7( - g1, g2, g3, g4, g5, g6, g7); -} - -template -internal::CartesianProductHolder8 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8) { - return internal::CartesianProductHolder8( - g1, g2, g3, g4, g5, g6, g7, g8); -} - -template -internal::CartesianProductHolder9 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9) { - return internal::CartesianProductHolder9( - g1, g2, g3, g4, g5, g6, g7, g8, g9); -} - -template -internal::CartesianProductHolder10 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9, - const Generator10& g10) { - return internal::CartesianProductHolder10( - g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); -} -# endif // GTEST_HAS_COMBINE - - - -# define TEST_P(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
-                  #test_case_name, \
-                  #test_name, \
-                  new ::testing::internal::TestMetaFactory< \
-                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
-      return 0; \
-    } \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
-  }; \
-  int GTEST_TEST_CLASS_NAME_(test_case_name, \
-                             test_name)::gtest_registering_dummy_ = \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
-  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
-
-# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
-  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
-      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
-                  #prefix, \
-                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
-                  __FILE__, __LINE__)
-
-}  // namespace testing
-
-#endif  // GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// Google C++ Testing Framework definitions useful in production code.
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-
-// When you need to test the private or protected members of a class,
-// use the FRIEND_TEST macro to declare your tests as friends of the
-// class. For example:
-//
-// class MyClass {
-//  private:
-//   void MyMethod();
-//   FRIEND_TEST(MyClassTest, MyMethod);
-// };
-//
-// class MyClassTest : public testing::Test {
-//   // ...
-// };
-//
-// TEST_F(MyClassTest, MyMethod) {
-//   // Can call MyClass::MyMethod() here.
-// } - -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test - -#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ - -#include -#include - -namespace testing { - -// A copyable object representing the result of a test part (i.e. an -// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). -// -// Don't inherit from TestPartResult as its destructor is not virtual. -class GTEST_API_ TestPartResult { - public: - // The possible outcomes of a test part (i.e. an assertion or an - // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). - enum Type { - kSuccess, // Succeeded. - kNonFatalFailure, // Failed but the test can continue. - kFatalFailure // Failed and the test should be terminated. - }; - - // C'tor. TestPartResult does NOT have a default constructor. - // Always use this constructor (with parameters) to create a - // TestPartResult object. - TestPartResult(Type a_type, - const char* a_file_name, - int a_line_number, - const char* a_message) - : type_(a_type), - file_name_(a_file_name == NULL ? "" : a_file_name), - line_number_(a_line_number), - summary_(ExtractSummary(a_message)), - message_(a_message) { - } - - // Gets the outcome of the test part. - Type type() const { return type_; } - - // Gets the name of the source file where the test part took place, or - // NULL if it's unknown. - const char* file_name() const { - return file_name_.empty() ? NULL : file_name_.c_str(); - } - - // Gets the line in the source file where the test part took place, - // or -1 if it's unknown. - int line_number() const { return line_number_; } - - // Gets the summary of the failure message. - const char* summary() const { return summary_.c_str(); } - - // Gets the message associated with the test part. 
- const char* message() const { return message_.c_str(); } - - // Returns true iff the test part passed. - bool passed() const { return type_ == kSuccess; } - - // Returns true iff the test part failed. - bool failed() const { return type_ != kSuccess; } - - // Returns true iff the test part non-fatally failed. - bool nonfatally_failed() const { return type_ == kNonFatalFailure; } - - // Returns true iff the test part fatally failed. - bool fatally_failed() const { return type_ == kFatalFailure; } - - private: - Type type_; - - // Gets the summary of the failure message by omitting the stack - // trace in it. - static std::string ExtractSummary(const char* message); - - // The name of the source file where the test part took place, or - // "" if the source file is unknown. - std::string file_name_; - // The line in the source file where the test part took place, or -1 - // if the line number is unknown. - int line_number_; - std::string summary_; // The test failure summary. - std::string message_; // The test failure message. -}; - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result); - -// An array of TestPartResult objects. -// -// Don't inherit from TestPartResultArray as its destructor is not -// virtual. -class GTEST_API_ TestPartResultArray { - public: - TestPartResultArray() {} - - // Appends the given TestPartResult to the array. - void Append(const TestPartResult& result); - - // Returns the TestPartResult at the given index (0-based). - const TestPartResult& GetTestPartResult(int index) const; - - // Returns the number of TestPartResult objects in the array. - int size() const; - - private: - std::vector array_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); -}; - -// This interface knows how to report a test part result. -class TestPartResultReporterInterface { - public: - virtual ~TestPartResultReporterInterface() {} - - virtual void ReportTestPartResult(const TestPartResult& result) = 0; -}; - -namespace internal { - -// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a -// statement generates new fatal failures. To do so it registers itself as the -// current test part result reporter. Besides checking if fatal failures were -// reported, it only delegates the reporting to the former result reporter. -// The original result reporter is restored in the destructor. -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -class GTEST_API_ HasNewFatalFailureHelper - : public TestPartResultReporterInterface { - public: - HasNewFatalFailureHelper(); - virtual ~HasNewFatalFailureHelper(); - virtual void ReportTestPartResult(const TestPartResult& result); - bool has_new_fatal_failure() const { return has_new_fatal_failure_; } - private: - bool has_new_fatal_failure_; - TestPartResultReporterInterface* original_reporter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); -}; - -} // namespace internal - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ - -// This header implements typed tests and type-parameterized tests. - -// Typed (aka type-driven) tests repeat the same test for types in a -// list. You must know which types you want to test with when writing -// typed tests. Here's how you do it: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - public: - ... - typedef std::list List; - static T shared_; - T value_; -}; - -// Next, associate a list of types with the test case, which will be -// repeated for each type in the list. The typedef is necessary for -// the macro to parse correctly. -typedef testing::Types MyTypes; -TYPED_TEST_CASE(FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// TYPED_TEST_CASE(FooTest, int); - -// Then, use TYPED_TEST() instead of TEST_F() to define as many typed -// tests for this test case as you want. -TYPED_TEST(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - // Since we are inside a derived class template, C++ requires use to - // visit the members of FooTest via 'this'. - TypeParam n = this->value_; - - // To visit static members of the fixture, add the TestFixture:: - // prefix. - n += TestFixture::shared_; - - // To refer to typedefs in the fixture, add the "typename - // TestFixture::" prefix. - typename TestFixture::List values; - values.push_back(n); - ... -} - -TYPED_TEST(FooTest, HasPropertyA) { ... } - -#endif // 0 - -// Type-parameterized tests are abstract test patterns parameterized -// by a type. Compared with typed tests, type-parameterized tests -// allow you to define the test pattern without knowing what the type -// parameters are. The defined pattern can be instantiated with -// different types any number of times, in any number of translation -// units. -// -// If you are designing an interface or concept, you can define a -// suite of type-parameterized tests to verify properties that any -// valid implementation of the interface/concept should have. 
Then, -// each implementation can easily instantiate the test suite to verify -// that it conforms to the requirements, without having to write -// similar tests repeatedly. Here's an example: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - ... -}; - -// Next, declare that you will define a type-parameterized test case -// (the _P suffix is for "parameterized" or "pattern", whichever you -// prefer): -TYPED_TEST_CASE_P(FooTest); - -// Then, use TYPED_TEST_P() to define as many type-parameterized tests -// for this type-parameterized test case as you want. -TYPED_TEST_P(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - TypeParam n = 0; - ... -} - -TYPED_TEST_P(FooTest, HasPropertyA) { ... } - -// Now the tricky part: you need to register all test patterns before -// you can instantiate them. The first argument of the macro is the -// test case name; the rest are the names of the tests in this test -// case. -REGISTER_TYPED_TEST_CASE_P(FooTest, - DoesBlah, HasPropertyA); - -// Finally, you are free to instantiate the pattern with the types you -// want. If you put the above code in a header file, you can #include -// it in multiple C++ source files and instantiate it multiple times. -// -// To distinguish different instances of the pattern, the first -// argument to the INSTANTIATE_* macro is a prefix that will be added -// to the actual test case name. Remember to pick unique prefixes for -// different instances. -typedef testing::Types MyTypes; -INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); - -#endif // 0 - - -// Implements typed tests. - -#if GTEST_HAS_TYPED_TEST - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the typedef for the type parameters of the -// given test case. -# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types) \ - typedef ::testing::internal::TypeList< Types >::type \ - GTEST_TYPE_PARAMS_(CaseName) - -# define TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel< \ - GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ - GTEST_TYPE_PARAMS_(CaseName)>::Register(\ - "", #CaseName, #TestName, 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() - -#endif // GTEST_HAS_TYPED_TEST - -// Implements type-parameterized tests. - -#if GTEST_HAS_TYPED_TEST_P - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the namespace name that the type-parameterized tests for -// the given type-parameterized test case are defined in. The exact -// name of the namespace is subject to change without notice. 
-# define GTEST_CASE_NAMESPACE_(TestCaseName) \ - gtest_case_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the variable used to remember the names of -// the defined tests in the given test case. -# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ - gtest_typed_test_case_p_state_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. -// -// Expands to the name of the variable used to remember the names of -// the registered tests in the given test case. -# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ - gtest_registered_test_names_##TestCaseName##_ - -// The variables defined in the type-parameterized test macros are -// static as typically these macros are used in a .h file that can be -// #included in multiple translation units linked together. -# define TYPED_TEST_CASE_P(CaseName) \ - static ::testing::internal::TypedTestCasePState \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) - -# define TYPED_TEST_P(CaseName, TestName) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - template \ - class TestName : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ - __FILE__, __LINE__, #CaseName, #TestName); \ - } \ - template \ - void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() - -# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ - } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ - __FILE__, __LINE__, #__VA_ARGS__) - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ - bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase::type>::Register(\ - #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) - -#endif // GTEST_HAS_TYPED_TEST_P - -#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest-message.h" +#include "gtest/gtest-param-test.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest_prod.h" +#include "gtest/gtest-test-part.h" +#include "gtest/gtest-typed-test.h" // Depending on the platform, different string classes are available. // On Linux, in addition to ::std::string, Google also makes use of // class ::string, which has the same interface as ::std::string, but // has a different implementation. // -// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that +// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that // ::string is available AND is a distinct type to ::std::string, or // define it to 0 to indicate otherwise. // -// If the user's ::std::string and ::string are the same class due to -// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0. +// If ::std::string and ::string are the same class on your platform +// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0. 
// -// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined +// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined // heuristically. namespace testing { @@ -17671,8 +258,31 @@ class GTEST_API_ AssertionResult { // Copy constructor. // Used in EXPECT_TRUE/FALSE(assertion_result). AssertionResult(const AssertionResult& other); + + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) + // Used in the EXPECT_TRUE/FALSE(bool_expression). - explicit AssertionResult(bool success) : success_(success) {} + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T& success, + typename internal::EnableIf< + !internal::ImplicitlyConvertible::value>::type* + /*enabler*/ = NULL) + : success_(success) {} + + GTEST_DISABLE_MSC_WARNINGS_POP_() + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } // Returns true iff the assertion succeeded. operator bool() const { return success_; } // NOLINT @@ -17713,6 +323,9 @@ class GTEST_API_ AssertionResult { message_->append(a_message.GetString().c_str()); } + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + // Stores result of the assertion predicate. bool success_; // Stores the message describing the condition in case the expectation @@ -17720,8 +333,6 @@ class GTEST_API_ AssertionResult { // Referenced via a pointer to avoid taking too much stack frame space // with test assertions. internal::scoped_ptr< ::std::string> message_; - - GTEST_DISALLOW_ASSIGN_(AssertionResult); }; // Makes a successful assertion result. @@ -17748,8 +359,8 @@ GTEST_API_ AssertionResult AssertionFailure(const Message& msg); // // class FooTest : public testing::Test { // protected: -// virtual void SetUp() { ... } -// virtual void TearDown() { ... } +// void SetUp() override { ... } +// void TearDown() override { ... } // ... // }; // @@ -17841,20 +452,19 @@ class GTEST_API_ Test { // internal method to avoid clashing with names used in user TESTs. void DeleteSelf_() { delete this; } - // Uses a GTestFlagSaver to save and restore all Google Test flags. - const internal::GTestFlagSaver* const gtest_flag_saver_; + const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_; - // Often a user mis-spells SetUp() as Setup() and spends a long time + // Often a user misspells SetUp() as Setup() and spends a long time // wondering why it is never called by Google Test. The declaration of // the following method is solely for catching such an error at // compile time: // // - The return type is deliberately chosen to be not void, so it - // will be a conflict if a user declares void Setup() in his test - // fixture. + // will be a conflict if void Setup() is declared in the user's + // test fixture. // // - This method is private, so it will be another compiler error - // if a user calls it from his test fixture. + // if the method is called from the user's test fixture. // // DO NOT OVERRIDE THIS FUNCTION. // @@ -18059,6 +669,12 @@ class GTEST_API_ TestInfo { return NULL; } + // Returns the file name where this test is defined. + const char* file() const { return location_.file.c_str(); } + + // Returns the line where this test is defined. 
+ int line() const { return location_.line; } + // Returns true if this test should run, that is if the test is not // disabled (or it is disabled but the also_run_disabled_tests flag has // been specified) and its full name matches the user-specified filter. @@ -18101,6 +717,7 @@ class GTEST_API_ TestInfo { const char* name, const char* type_param, const char* value_param, + internal::CodeLocation code_location, internal::TypeId fixture_class_id, Test::SetUpTestCaseFunc set_up_tc, Test::TearDownTestCaseFunc tear_down_tc, @@ -18112,6 +729,7 @@ class GTEST_API_ TestInfo { const std::string& name, const char* a_type_param, // NULL if not a type-parameterized test const char* a_value_param, // NULL if not a value-parameterized test + internal::CodeLocation a_code_location, internal::TypeId fixture_class_id, internal::TestFactoryBase* factory); @@ -18138,6 +756,7 @@ class GTEST_API_ TestInfo { // Text representation of the value parameter, or NULL if this is not a // value-parameterized test. const internal::scoped_ptr value_param_; + internal::CodeLocation location_; const internal::TypeId fixture_class_id_; // ID of the test fixture class bool should_run_; // True iff this test should run bool is_disabled_; // True iff this test is disabled @@ -18337,7 +956,7 @@ class GTEST_API_ TestCase { }; // An Environment object is capable of setting up and tearing down an -// environment. The user should subclass this to define his own +// environment. You should subclass this to define your own // environment(s). // // An Environment object does the set-up and tear-down in virtual @@ -18749,137 +1368,42 @@ GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); namespace internal { -// FormatForComparison::Format(value) formats a -// value of type ToPrint that is an operand of a comparison assertion -// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in -// the comparison, and is used to help determine the best way to -// format the value. In particular, when the value is a C string -// (char pointer) and the other operand is an STL string object, we -// want to format the C string as a string, since we know it is -// compared by value with the string object. If the value is a char -// pointer but the other operand is not an STL string object, we don't -// know whether the pointer is supposed to point to a NUL-terminated -// string, and thus want to print it as a pointer to be safe. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - -// The default case. -template -class FormatForComparison { - public: - static ::std::string Format(const ToPrint& value) { - return ::testing::PrintToString(value); - } -}; - -// Array. -template -class FormatForComparison { - public: - static ::std::string Format(const ToPrint* value) { - return FormatForComparison::Format(value); - } -}; - -// By default, print C string as pointers to be safe, as we don't know -// whether they actually point to a NUL-terminated string. 
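For illustration only (this snippet is not part of the patch): the removed comment block above explains how FormatForComparison chooses a formatting for a char* operand based on the type of the other operand. A minimal sketch of the observable behaviour, assuming a test binary linked against gtest_main; the test and variable names are illustrative.

#include <string>
#include "gtest/gtest.h"

TEST(FormatForComparisonSketch, PointerVersusStringOperand) {
  const char* greeting = "hello";
  std::string expected = "hello";

  // The other operand is a std::string, so on a mismatch Google Test would
  // format the char* by content (as a quoted string) in the failure message.
  EXPECT_EQ(expected, greeting);

  // The other operand is another char*, so on a mismatch both sides would be
  // formatted as raw pointer values, since Google Test cannot know whether
  // they point to NUL-terminated strings.
  const char* alias = greeting;
  EXPECT_EQ(alias, greeting);
}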
- -#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ - template \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(static_cast(value)); \ - } \ - } - -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); - -#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ - -// If a C string is compared with an STL string object, we know it's meant -// to point to a NUL-terminated string, and thus can print it as a string. - -#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ - template <> \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(value); \ - } \ - } - -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); - -#if GTEST_HAS_GLOBAL_STRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); -#endif - -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); -#endif - -#if GTEST_HAS_STD_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); -#endif - -#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ - -// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) -// operand to be used in a failure message. The type (but not value) -// of the other operand may affect the format. This allows us to -// print a char* as a raw pointer when it is compared against another -// char* or void*, and print it as a C string when it is compared -// against an std::string object, for example. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers +// when calling EXPECT_* in a tight loop. template -std::string FormatForComparisonFailureMessage( - const T1& value, const T2& /* other_operand */) { - return FormatForComparison::Format(value); +AssertionResult CmpHelperEQFailure(const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, const T2& rhs) { + return EqFailure(lhs_expression, + rhs_expression, + FormatForComparisonFailureMessage(lhs, rhs), + FormatForComparisonFailureMessage(rhs, lhs), + false); } // The helper function for {ASSERT|EXPECT}_EQ. template -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4389) // Temporarily disables warning on - // signed/unsigned mismatch. -#endif - - if (expected == actual) { +AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, + const T2& rhs) { +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */) + if (lhs == rhs) { return AssertionSuccess(); } +GTEST_DISABLE_MSC_WARNINGS_POP_() -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. 
-#endif - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); + return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } // With this overloaded version, we allow anonymous enums to be used // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums // can be implicitly cast to BiggestInt. -GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual); +GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, + BiggestInt lhs, + BiggestInt rhs); // The helper class for {ASSERT|EXPECT}_EQ. The template argument // lhs_is_null_literal is true iff the first argument to ASSERT_EQ() @@ -18890,12 +1414,11 @@ class EqHelper { public: // This templatized version is for the general case. template - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, + const T2& rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } // With this overloaded version, we allow anonymous enums to be used @@ -18904,12 +1427,11 @@ class EqHelper { // // Even though its body looks the same as the above version, we // cannot merge the two, as it will make anonymous enums unhappy. - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, + BiggestInt lhs, + BiggestInt rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } }; @@ -18924,40 +1446,52 @@ class EqHelper { // EXPECT_EQ(false, a_bool). template static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual, + const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, + const T2& rhs, // The following line prevents this overload from being considered if T2 // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) // expands to Compare("", "", NULL, my_ptr), which requires a conversion // to match the Secret* in the other overload, which would otherwise make // this template match better. typename EnableIf::value>::type* = 0) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } // This version will be picked when the second argument to ASSERT_EQ() is a // pointer, e.g. ASSERT_EQ(NULL, a_pointer). template static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, + const char* lhs_expression, + const char* rhs_expression, // We used to have a second template parameter instead of Secret*. That // template parameter would deduce to 'long', making this a better match // than the first overload even without the first overload's EnableIf. 
// Unfortunately, gcc with -Wconversion-null warns when "passing NULL to // non-pointer argument" (even a deduced integral argument), so the old // implementation caused warnings in user code. - Secret* /* expected (NULL) */, - T* actual) { - // We already know that 'expected' is a null pointer. - return CmpHelperEQ(expected_expression, actual_expression, - static_cast(NULL), actual); + Secret* /* lhs (NULL) */, + T* rhs) { + // We already know that 'lhs' is a null pointer. + return CmpHelperEQ(lhs_expression, rhs_expression, + static_cast(NULL), rhs); } }; +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers +// when calling EXPECT_OP in a tight loop. +template +AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, + const T1& val1, const T2& val2, + const char* op) { + return AssertionFailure() + << "Expected: (" << expr1 << ") " << op << " (" << expr2 + << "), actual: " << FormatForComparisonFailureMessage(val1, val2) + << " vs " << FormatForComparisonFailureMessage(val2, val1); +} + // A macro for implementing the helper functions needed to implement // ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste // of similar code. @@ -18968,6 +1502,7 @@ class EqHelper { // with gcc 4. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + #define GTEST_IMPL_CMP_HELPER_(op_name, op)\ template \ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ @@ -18975,10 +1510,7 @@ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ if (val1 op val2) {\ return AssertionSuccess();\ } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ }\ }\ GTEST_API_ AssertionResult CmpHelper##op_name(\ @@ -19002,18 +1534,18 @@ GTEST_IMPL_CMP_HELPER_(GT, >); // The helper function for {ASSERT|EXPECT}_STREQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASEEQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); +GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); // The helper function for {ASSERT|EXPECT}_STRNE. // @@ -19035,10 +1567,10 @@ GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, // Helper function for *_STREQ on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual); +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2); // Helper function for *_STRNE on wide strings. 
// @@ -19096,28 +1628,28 @@ namespace internal { // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. template -AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, - const char* actual_expression, - RawType expected, - RawType actual) { - const FloatingPoint lhs(expected), rhs(actual); +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + RawType lhs_value, + RawType rhs_value) { + const FloatingPoint lhs(lhs_value), rhs(rhs_value); if (lhs.AlmostEquals(rhs)) { return AssertionSuccess(); } - ::std::stringstream expected_ss; - expected_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << expected; + ::std::stringstream lhs_ss; + lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << lhs_value; - ::std::stringstream actual_ss; - actual_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << actual; + ::std::stringstream rhs_ss; + rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << rhs_value; - return EqFailure(expected_expression, - actual_expression, - StringStreamToString(&expected_ss), - StringStreamToString(&actual_ss), + return EqFailure(lhs_expression, + rhs_expression, + StringStreamToString(&lhs_ss), + StringStreamToString(&rhs_ss), false); } @@ -19325,13 +1857,13 @@ class TestWithParam : public Test, public WithParamInterface { // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. #define EXPECT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ GTEST_NONFATAL_FAILURE_) #define EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ GTEST_FATAL_FAILURE_) #define ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ @@ -19339,373 +1871,16 @@ class TestWithParam : public Test, public WithParamInterface { // Includes the auto-generated header that implements a family of // generic predicate assertion macros. -// Copyright 2006, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command -// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! -// -// Implements a family of generic predicate assertion macros. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ - -// Makes sure this header is not included before gtest.h. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ - -// This header implements a family of generic predicate assertion -// macros: -// -// ASSERT_PRED_FORMAT1(pred_format, v1) -// ASSERT_PRED_FORMAT2(pred_format, v1, v2) -// ... -// -// where pred_format is a function or functor that takes n (in the -// case of ASSERT_PRED_FORMATn) values and their source expression -// text, and returns a testing::AssertionResult. See the definition -// of ASSERT_EQ in gtest.h for an example. -// -// If you don't care about formatting, you can use the more -// restrictive version: -// -// ASSERT_PRED1(pred, v1) -// ASSERT_PRED2(pred, v1, v2) -// ... -// -// where pred is an n-ary function or functor that returns bool, -// and the values v1, v2, ..., must support the << operator for -// streaming to std::ostream. -// -// We also define the EXPECT_* variations. -// -// For now we only support predicates whose arity is at most 5. -// Please email googletestframework@googlegroups.com if you need -// support for higher arities. - -// GTEST_ASSERT_ is the basic statement to which all of the assertions -// in this file reduce. Don't use this in your code. - -#define GTEST_ASSERT_(expression, on_failure) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar = (expression)) \ - ; \ - else \ - on_failure(gtest_ar.failure_message()) - - -// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -template -AssertionResult AssertPred1Helper(const char* pred_text, - const char* e1, - Pred pred, - const T1& v1) { - if (pred(v1)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. -// Don't use this in your code. -#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, v1), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -#define GTEST_PRED1_(pred, v1, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ - #v1, \ - pred, \ - v1), on_failure) - -// Unary predicate assertion macros. 
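For illustration only (not part of the patch): the comments above describe the two flavours of predicate assertions. *_PREDn takes a bool-returning predicate, while *_PRED_FORMATn takes a predicate-formatter that also receives the source expression text and returns testing::AssertionResult. A minimal sketch, assuming gtest_main; IsEven and IsEvenFormat are illustrative names.

#include "gtest/gtest.h"

// Plain predicate: returns bool, usable with EXPECT_PRED1/ASSERT_PRED1.
bool IsEven(int n) { return (n % 2) == 0; }

// Predicate-formatter: also receives the expression text and builds its own
// failure message, usable with EXPECT_PRED_FORMAT1/ASSERT_PRED_FORMAT1.
::testing::AssertionResult IsEvenFormat(const char* expr, int n) {
  if ((n % 2) == 0) return ::testing::AssertionSuccess();
  return ::testing::AssertionFailure() << expr << " = " << n << ", which is odd";
}

TEST(PredicateAssertionSketch, UnaryForms) {
  EXPECT_PRED1(IsEven, 4);                // a failure would print the expression and value
  EXPECT_PRED_FORMAT1(IsEvenFormat, 10);  // a failure would use the custom message
}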
-#define EXPECT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -template -AssertionResult AssertPred2Helper(const char* pred_text, - const char* e1, - const char* e2, - Pred pred, - const T1& v1, - const T2& v2) { - if (pred(v1, v2)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. -// Don't use this in your code. -#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -#define GTEST_PRED2_(pred, v1, v2, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ - #v1, \ - #v2, \ - pred, \ - v1, \ - v2), on_failure) - -// Binary predicate assertion macros. -#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -template -AssertionResult AssertPred3Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3) { - if (pred(v1, v2, v3)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. -// Don't use this in your code. -#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - pred, \ - v1, \ - v2, \ - v3), on_failure) - -// Ternary predicate assertion macros. -#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. 
-template -AssertionResult AssertPred4Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4) { - if (pred(v1, v2, v3, v4)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. -// Don't use this in your code. -#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. -#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4), on_failure) - -// 4-ary predicate assertion macros. -#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -template -AssertionResult AssertPred5Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - const char* e5, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4, - const T5& v5) { - if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ", " - << e5 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4 - << "\n" << e5 << " evaluates to " << v5; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. -// Don't use this in your code. -#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) - -// 5-ary predicate assertion macros. 
-#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#include "gtest/gtest_pred_impl.h" // Macros for testing equalities and inequalities. // -// * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual -// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 -// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 -// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 -// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 -// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 +// * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 +// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 +// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 +// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 +// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 +// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 // // When they are not, Google Test prints both the tested expressions and // their actual values. The values must be compatible built-in types, @@ -19727,8 +1902,8 @@ AssertionResult AssertPred5Helper(const char* pred_text, // are related, not how their content is related. To compare two C // strings by content, use {ASSERT|EXPECT}_STR*(). // -// 3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to -// {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you +// 3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to +// {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you // what the actual value is when it fails, and similarly for the // other comparisons. // @@ -19744,12 +1919,12 @@ AssertionResult AssertPred5Helper(const char* pred_text, // ASSERT_LT(i, array_size); // ASSERT_GT(records.size(), 0) << "There is no record left."; -#define EXPECT_EQ(expected, actual) \ +#define EXPECT_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) -#define EXPECT_NE(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual) + EqHelper::Compare, \ + val1, val2) +#define EXPECT_NE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define EXPECT_LE(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) #define EXPECT_LT(val1, val2) \ @@ -19759,10 +1934,10 @@ AssertionResult AssertPred5Helper(const char* pred_text, #define EXPECT_GT(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) -#define GTEST_ASSERT_EQ(expected, actual) \ +#define GTEST_ASSERT_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) + EqHelper::Compare, \ + val1, val2) #define GTEST_ASSERT_NE(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define GTEST_ASSERT_LE(val1, val2) \ @@ -19817,29 +1992,29 @@ AssertionResult AssertPred5Helper(const char* pred_text, // // These macros evaluate their arguments exactly once. 
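For illustration only (not part of the patch): the comment block above notes that EXPECT_EQ on two C string pointers compares addresses, not contents, while the *_STR* macros defined next compare contents. A minimal sketch, assuming gtest_main; names are illustrative.

#include <cstring>
#include "gtest/gtest.h"

TEST(CStringAssertionSketch, ContentVersusPointerComparison) {
  char buffer[6];
  std::strcpy(buffer, "hello");
  const char* literal = "hello";

  // EXPECT_EQ(literal, buffer) would compare the two pointer values and
  // usually fail, because buffer and literal are distinct addresses.

  // EXPECT_STREQ compares the NUL-terminated contents, so this passes.
  EXPECT_STREQ(literal, buffer);

  // EXPECT_STRCASEEQ ignores case when comparing contents.
  EXPECT_STRCASEEQ("HELLO", buffer);
}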
-#define EXPECT_STREQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define EXPECT_STREQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) #define EXPECT_STRNE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define EXPECT_STRCASEEQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define EXPECT_STRCASEEQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) #define EXPECT_STRCASENE(s1, s2)\ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) -#define ASSERT_STREQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define ASSERT_STREQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) #define ASSERT_STRNE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define ASSERT_STRCASEEQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define ASSERT_STRCASEEQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) #define ASSERT_STRCASENE(s1, s2)\ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) // Macros for comparing floating-point numbers. // -// * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual): +// * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2): // Tests that two float values are almost equal. -// * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual): +// * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2): // Tests that two double values are almost equal. // * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): // Tests that v1 and v2 are within the given distance to each other. @@ -19849,21 +2024,21 @@ AssertionResult AssertPred5Helper(const char* pred_text, // FloatingPoint template class in gtest-internal.h if you are // interested in the implementation details. -#define EXPECT_FLOAT_EQ(expected, actual)\ +#define EXPECT_FLOAT_EQ(val1, val2)\ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define EXPECT_DOUBLE_EQ(expected, actual)\ +#define EXPECT_DOUBLE_EQ(val1, val2)\ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define ASSERT_FLOAT_EQ(expected, actual)\ +#define ASSERT_FLOAT_EQ(val1, val2)\ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define ASSERT_DOUBLE_EQ(expected, actual)\ +#define ASSERT_DOUBLE_EQ(val1, val2)\ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) #define EXPECT_NEAR(val1, val2, abs_error)\ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ @@ -19985,8 +2160,8 @@ bool StaticAssertTypeEq() { // The convention is to end the test case name with "Test". For // example, a test case for the Foo class can be named FooTest. // -// The user should put his test code between braces after using this -// macro. Example: +// Test code should appear between braces after an invocation of +// this macro. 
Example: // // TEST(FooTest, InitializesCorrectly) { // Foo foo; diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h new file mode 100644 index 0000000000..30ae712f50 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -0,0 +1,358 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command +// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! +// +// Implements a family of generic predicate assertion macros. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +// Makes sure this header is not included before gtest.h. +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... +// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. + +// GTEST_ASSERT_ is the basic statement to which all of the assertions +// in this file reduce. 
Don't use this in your code. + +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar = (expression)) \ + ; \ + else \ + on_failure(gtest_ar.failure_message()) + + +// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +template +AssertionResult AssertPred1Helper(const char* pred_text, + const char* e1, + Pred pred, + const T1& v1) { + if (pred(v1)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. +// Don't use this in your code. +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, v1), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +#define GTEST_PRED1_(pred, v1, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ + #v1, \ + pred, \ + v1), on_failure) + +// Unary predicate assertion macros. +#define EXPECT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +template +AssertionResult AssertPred2Helper(const char* pred_text, + const char* e1, + const char* e2, + Pred pred, + const T1& v1, + const T2& v2) { + if (pred(v1, v2)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. +// Don't use this in your code. +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +#define GTEST_PRED2_(pred, v1, v2, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ + #v1, \ + #v2, \ + pred, \ + v1, \ + v2), on_failure) + +// Binary predicate assertion macros. +#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. 
+template +AssertionResult AssertPred3Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3) { + if (pred(v1, v2, v3)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. +// Don't use this in your code. +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + pred, \ + v1, \ + v2, \ + v3), on_failure) + +// Ternary predicate assertion macros. +#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +template +AssertionResult AssertPred4Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4) { + if (pred(v1, v2, v3, v4)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. +// Don't use this in your code. +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4), on_failure) + +// 4-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. 
+template +AssertionResult AssertPred5Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + const char* e5, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4, + const T5& v5) { + if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ", " + << e5 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4 + << "\n" << e5 << " evaluates to " << v5; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. +// Don't use this in your code. +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + #v5, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4, \ + v5), on_failure) + +// 5-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) + + + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h new file mode 100644 index 0000000000..da80ddc6c7 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -0,0 +1,58 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Google C++ Testing Framework definitions useful in production code. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void MyMethod(); +// FRIEND_TEST(MyClassTest, MyMethod); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, MyMethod) { +// // Can call MyClass::MyMethod() here. +// } + +#define FRIEND_TEST(test_case_name, test_name)\ +friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h new file mode 100644 index 0000000000..7e744bd3bb --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -0,0 +1,69 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. +// The following macros can be defined: +// +// Flag related macros: +// GTEST_FLAG(flag_name) +// GTEST_USE_OWN_FLAGFILE_FLAG_ - Define to 0 when the system provides its +// own flagfile flag parsing. 
+// GTEST_DECLARE_bool_(name) +// GTEST_DECLARE_int32_(name) +// GTEST_DECLARE_string_(name) +// GTEST_DEFINE_bool_(name, default_val, doc) +// GTEST_DEFINE_int32_(name, default_val, doc) +// GTEST_DEFINE_string_(name, default_val, doc) +// +// Test filtering: +// GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that +// will be used if --GTEST_FLAG(test_filter) +// is not provided. +// +// Logging: +// GTEST_LOG_(severity) +// GTEST_CHECK_(condition) +// Functions LogToStderr() and FlushInfoLog() have to be provided too. +// +// Threading: +// GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided. +// GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are +// already provided. +// Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and +// GTEST_DEFINE_STATIC_MUTEX_(mutex) +// +// GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +// GTEST_LOCK_EXCLUDED_(locks) +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h new file mode 100644 index 0000000000..60c1ea050b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -0,0 +1,42 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// This file provides an injection point for custom printers in a local +// installation of gTest. +// It will be included from gtest-printers.h and the overrides in this file +// will be visible to everyone. +// See documentation at gtest/gtest-printers.h for details on how to define a +// custom printer. 
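For illustration only (not part of the patch): the new custom/gtest-printers.h above is an empty injection point; the usual way to teach Google Test how to print a user-defined type is a PrintTo overload in the type's own namespace, found via argument-dependent lookup, as documented in gtest-printers.h. A minimal sketch; the Point type and namespace are illustrative.

#include <ostream>
#include "gtest/gtest.h"

namespace geometry {

struct Point { int x; int y; };

// Google Test calls this overload whenever it needs to render a Point,
// e.g. in an assertion failure message or via testing::PrintToString.
void PrintTo(const Point& p, ::std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}

}  // namespace geometry

TEST(CustomPrinterSketch, UsesPrintTo) {
  geometry::Point p = {1, 2};
  // PrintToString goes through the PrintTo hook defined above.
  EXPECT_EQ("(1, 2)", ::testing::PrintToString(p));
}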
+// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h new file mode 100644 index 0000000000..c27412a898 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -0,0 +1,41 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. +// The following macros can be defined: +// +// GTEST_OS_STACK_TRACE_GETTER_ - The name of an implementation of +// OsStackTraceGetterInterface. +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h new file mode 100644 index 0000000000..2b3a78f5bf --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -0,0 +1,319 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + +#include "gtest/internal/gtest-internal.h" + +#include + +namespace testing { +namespace internal { + +GTEST_DECLARE_string_(internal_run_death_test); + +// Names of the flags (needed for parsing Google Test flags). +const char kDeathTestStyleFlag[] = "death_test_style"; +const char kDeathTestUseFork[] = "death_test_use_fork"; +const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; + +#if GTEST_HAS_DEATH_TEST + +// DeathTest is a class that hides much of the complexity of the +// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method +// returns a concrete class that depends on the prevailing death test +// style, as defined by the --gtest_death_test_style and/or +// --gtest_internal_run_death_test flags. + +// In describing the results of death tests, these terms are used with +// the corresponding definitions: +// +// exit status: The integer exit information in the format specified +// by wait(2) +// exit code: The integer code passed to exit(3), _exit(2), or +// returned from main() +class GTEST_API_ DeathTest { + public: + // Create returns false if there was an error determining the + // appropriate action to take for the current death test; for example, + // if the gtest_death_test_style flag is set to an invalid value. + // The LastMessage method will return a more detailed message in that + // case. Otherwise, the DeathTest pointer pointed to by the "test" + // argument is set. If the death test should be skipped, the pointer + // is set to NULL; otherwise, it is set to the address of a new concrete + // DeathTest object that controls the execution of the current test. + static bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); + DeathTest(); + virtual ~DeathTest() { } + + // A helper class that aborts a death test when it's deleted. 
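The death-test style that DeathTest::Create consults can be selected from the command line (--gtest_death_test_style=threadsafe) or programmatically; a minimal sketch, assuming a standard test main():

    #include "gtest/gtest.h"

    int main(int argc, char** argv) {
      ::testing::InitGoogleTest(&argc, argv);
      // "fast" (default) or "threadsafe"; an invalid value makes Create() fail.
      ::testing::FLAGS_gtest_death_test_style = "threadsafe";
      return RUN_ALL_TESTS();
    }
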
+ class ReturnSentinel { + public: + explicit ReturnSentinel(DeathTest* test) : test_(test) { } + ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: + DeathTest* const test_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + } GTEST_ATTRIBUTE_UNUSED_; + + // An enumeration of possible roles that may be taken when a death + // test is encountered. EXECUTE means that the death test logic should + // be executed immediately. OVERSEE means that the program should prepare + // the appropriate environment for a child process to execute the death + // test, then wait for it to complete. + enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; + + // An enumeration of the three reasons that a test might be aborted. + enum AbortReason { + TEST_ENCOUNTERED_RETURN_STATEMENT, + TEST_THREW_EXCEPTION, + TEST_DID_NOT_DIE + }; + + // Assumes one of the above roles. + virtual TestRole AssumeRole() = 0; + + // Waits for the death test to finish and returns its status. + virtual int Wait() = 0; + + // Returns true if the death test passed; that is, the test process + // exited during the test, its exit status matches a user-supplied + // predicate, and its stderr output matches a user-supplied regular + // expression. + // The user-supplied predicate may be a macro expression rather + // than a function pointer or functor, or else Wait and Passed could + // be combined. + virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const std::string& message); + + private: + // A string containing a description of the outcome of the last death test. + static std::string last_death_test_message_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); +}; + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() { } + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. +# if GTEST_HAS_EXCEPTIONS +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf(\ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) 
{ \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +# else +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +# endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. +# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + const ::testing::internal::RE& gtest_regex = (regex); \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != NULL) { \ + ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ + gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel \ + gtest_sentinel(gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + default: \ + break; \ + } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ + fail(::testing::internal::DeathTest::LastMessage()) +// The symbol "fail" here expands to something into which a message +// can be streamed. + +// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in +// NDEBUG mode. In this case we need the statements to be executed, the regex is +// ignored, and the macro must accept a streamed message even though the message +// is never printed. +# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string& a_file, + int a_line, + int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), + write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) + posix::Close(write_fd_); + } + + const std::string& file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); + +#else // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. 
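For reference, the public macros built on this machinery are used as below (a sketch; MyFatalFunction is hypothetical and not part of this patch):

    #include <cstdio>
    #include <cstdlib>
    #include "gtest/gtest.h"

    void MyFatalFunction() {
      fprintf(stderr, "invalid state\n");
      abort();
    }

    TEST(MyDeathTest, DiesOnInvalidState) {
      // Child process must terminate and its stderr must match the regex.
      EXPECT_DEATH(MyFatalFunction(), "invalid state");
      // Portable variant: compiles everywhere, only runs where supported.
      EXPECT_DEATH_IF_SUPPORTED(MyFatalFunction(), "invalid state");
      // EXPECT_EXIT/ASSERT_EXIT take an explicit exit-status predicate.
      EXPECT_EXIT(exit(1), ::testing::ExitedWithCode(1), "");
    }
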
This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h new file mode 100644 index 0000000000..7a13b4b0de --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -0,0 +1,206 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: keith.ray@gmail.com (Keith Ray) +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in . +// Do not include this header file separately! + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +#include "gtest/internal/gtest-string.h" + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") { } + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + + explicit FilePath(const std::string& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { + pathname_ = rhs.pathname_; + } + + const std::string& string() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. 
+ // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true iff the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. 
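A small sketch of how this internal FilePath class is typically exercised (the paths are made up; within the library it mainly names XML report files):

    #include <string>
    #include "gtest/gtest.h"

    using ::testing::internal::FilePath;

    void FilePathDemo() {
      const FilePath dir(std::string("reports/"));
      // "reports/results_3.xml" on POSIX; '\' is the separator on Windows.
      const FilePath name =
          FilePath::MakeFileName(dir, FilePath(std::string("results")), 3, "xml");
      const FilePath parent = name.RemoveFileName();      // "reports/"
      const FilePath base = name.RemoveDirectoryName();   // "results_3.xml"
      (void)parent; (void)base;
    }
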
+ // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". + // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. + // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + std::string pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h new file mode 100644 index 0000000000..ebd1cf615d --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -0,0 +1,1238 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +#include "gtest/internal/gtest-port.h" + +#if GTEST_OS_LINUX +# include +# include +# include +# include +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-type-util.h" + +// Due to C++ preprocessor weirdness, we need double indirection to +// concatenate two tokens when one of them is __LINE__. Writing +// +// foo ## __LINE__ +// +// will result in the token foo__LINE__, instead of foo followed by +// the current line number. For more details, see +// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 +#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar + +class ProtocolMessage; +namespace proto2 { class Message; } + +namespace testing { + +// Forward declarations. + +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test cases. + +template +::std::string PrintToString(const T& value); + +namespace internal { + +struct TraceInfo; // Information about a trace point. +class ScopedTrace; // Implements scoped trace. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest + +// The text used in failure messages to indicate the start of the +// stack trace. +GTEST_API_ extern const char kStackTraceMarker[]; + +// Two overloaded helpers for checking at compile time whether an +// expression is a null pointer literal (i.e. NULL or any 0-valued +// compile-time integral constant). Their return values have +// different sizes, so we can use sizeof() to test which version is +// picked by the compiler. These helpers have no implementations, as +// we only need their signatures. +// +// Given IsNullLiteralHelper(x), the compiler will pick the first +// version if x can be implicitly converted to Secret*, and pick the +// second version otherwise. Since Secret is a secret and incomplete +// type, the only expression a user can write that has type Secret* is +// a null pointer literal. Therefore, we know that x is a null +// pointer literal if and only if the first version is picked by the +// compiler. +char IsNullLiteralHelper(Secret* p); +char (&IsNullLiteralHelper(...))[2]; // NOLINT + +// A compile-time bool constant that is true if and only if x is a +// null pointer literal (i.e. NULL or any 0-valued compile-time +// integral constant). +#ifdef GTEST_ELLIPSIS_NEEDS_POD_ +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). 
+# define GTEST_IS_NULL_LITERAL_(x) false +#else +# define GTEST_IS_NULL_LITERAL_(x) \ + (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) +#endif // GTEST_ELLIPSIS_NEEDS_POD_ + +// Appends the user-supplied message to the Google-Test-generated message. +GTEST_API_ std::string AppendUserMessage( + const std::string& gtest_msg, const Message& user_msg); + +#if GTEST_HAS_EXCEPTIONS + +// This exception is thrown by (and only by) a failed Google Test +// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions +// are enabled). We derive it from std::runtime_error, which is for +// errors presumably detectable only at run time. Since +// std::runtime_error inherits from std::exception, many testing +// frameworks know how to extract and print the message inside it. +class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { + public: + explicit GoogleTestFailureException(const TestPartResult& failure); +}; + +#endif // GTEST_HAS_EXCEPTIONS + +// A helper class for creating scoped traces in user programs. +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + ScopedTrace(const char* file, int line, const Message& message); + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + +namespace edit_distance { +// Returns the optimal edits to go from 'left' to 'right'. +// All edits cost the same, with replace having lower priority than +// add/remove. +// Simple implementation of the Wagner–Fischer algorithm. +// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm +enum EditType { kMatch, kAdd, kRemove, kReplace }; +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, const std::vector& right); + +// Same as above, but the input is represented as strings. +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right); + +// Create a diff of the input strings in Unified diff format. +GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context = 2); + +} // namespace edit_distance + +// Calculate the diff between 'left' and 'right' and return it in unified diff +// format. +// If not null, stores in 'total_line_count' the total number of lines found +// in left + right. +GTEST_API_ std::string DiffStrings(const std::string& left, + const std::string& right, + size_t* total_line_count); + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. 
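The ScopedTrace class above backs the public SCOPED_TRACE macro; a short usage sketch (Sub is an illustrative helper):

    #include "gtest/gtest.h"

    void Sub(int n) { EXPECT_GE(n, 0); }

    TEST(TraceTest, AnnotatesFailures) {
      for (int i = -1; i < 2; ++i) {
        SCOPED_TRACE(i);  // any failure inside this scope reports the trace point
        Sub(i);
      }
    }
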
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const std::string& expected_value, + const std::string& actual_value, + bool ignoring_case); + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +GTEST_API_ std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value); + +// This template class represents an IEEE floating-point number +// (either single-precision or double-precision, depending on the +// template parameters). +// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8*sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = + ~static_cast(0) >> (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. + // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const size_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType& x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. 
+ // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { + return ReinterpretBits(kExponentBitMask); + } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits &bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. + Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true iff this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true iff this number is at most kMaxUlps ULP's away from + // rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint& rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) + <= kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits &sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, + const Bits &sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits::max() as it clashes with the max() +// macro defined by . 
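A sketch of the ULP-based comparison this class implements; in tests one would normally reach it through EXPECT_FLOAT_EQ / EXPECT_DOUBLE_EQ rather than directly:

    #include "gtest/gtest.h"

    bool NearlyEqual(float a, float b) {
      const ::testing::internal::FloatingPoint<float> lhs(a), rhs(b);
      // True when a and b differ by at most kMaxUlps (4) units in the last place.
      return lhs.AlmostEquals(rhs);
    }
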
+template <> +inline float FloatingPoint::Max() { return FLT_MAX; } +template <> +inline double FloatingPoint::Max() { return DBL_MAX; } + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test case, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. +typedef const void* TypeId; + +template +class TypeIdHelper { + public: + // dummy_ must not have a const type. Otherwise an overly eager + // compiler (e.g. MSVC 7.1 & 8.0) may try to merge + // TypeIdHelper::dummy_ for different Ts as an "optimization". + static bool dummy_; +}; + +template +bool TypeIdHelper::dummy_ = false; + +// GetTypeId() returns the ID of type T. Different values will be +// returned for different types. Calling the function twice with the +// same type argument is guaranteed to return the same ID. +template +TypeId GetTypeId() { + // The compiler is required to allocate a different + // TypeIdHelper::dummy_ variable for each T used to instantiate + // the template. Therefore, the address of dummy_ is guaranteed to + // be unique. + return &(TypeIdHelper::dummy_); +} + +// Returns the type ID of ::testing::Test. Always call this instead +// of GetTypeId< ::testing::Test>() to get the type ID of +// ::testing::Test, as the latter may give the wrong result due to a +// suspected linker bug when compiling Google Test as a Mac OS X +// framework. +GTEST_API_ TypeId GetTestTypeId(); + +// Defines the abstract factory interface that creates instances +// of a Test object. +class TestFactoryBase { + public: + virtual ~TestFactoryBase() {} + + // Creates a test instance to run. The instance is both created and destroyed + // within TestInfoImpl::Run() + virtual Test* CreateTest() = 0; + + protected: + TestFactoryBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); +}; + +// This class provides implementation of TeastFactoryBase interface. +// It is used in TEST and TEST_F macros. +template +class TestFactoryImpl : public TestFactoryBase { + public: + virtual Test* CreateTest() { return new TestClass; } +}; + +#if GTEST_OS_WINDOWS + +// Predicate-formatters for implementing the HRESULT checking macros +// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} +// We pass a long instead of HRESULT to avoid causing an +// include dependency for the HRESULT type. +GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, + long hr); // NOLINT +GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, + long hr); // NOLINT + +#endif // GTEST_OS_WINDOWS + +// Types of SetUpTestCase() and TearDownTestCase() functions. +typedef void (*SetUpTestCaseFunc)(); +typedef void (*TearDownTestCaseFunc)(); + +struct CodeLocation { + CodeLocation(const string& a_file, int a_line) : file(a_file), line(a_line) {} + + string file; + int line; +}; + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. 
+// value_param text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + CodeLocation code_location, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// State of the definition of a type-parameterized test case. +class GTEST_API_ TypedTestCasePState { + public: + TypedTestCasePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test case hasn't been registered; otherwise aborts the + // program. + bool AddTestName(const char* file, int line, const char* case_name, + const char* test_name) { + if (registered_) { + fprintf(stderr, "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + registered_tests_.insert( + ::std::make_pair(test_name, CodeLocation(file, line))); + return true; + } + + bool TestExists(const std::string& test_name) const { + return registered_tests_.count(test_name) > 0; + } + + const CodeLocation& GetCodeLocation(const std::string& test_name) const { + RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name); + GTEST_CHECK_(it != registered_tests_.end()); + return it->second; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char* VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests); + + private: + typedef ::std::map RegisteredTestsMap; + + bool registered_; + RegisteredTestsMap registered_tests_; +}; + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char* SkipComma(const char* str) { + const char* comma = strchr(str, ','); + if (comma == NULL) { + return NULL; + } + while (IsSpace(*(++comma))) {} + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char* str) { + const char* comma = strchr(str, ','); + return comma == NULL ? str : std::string(str, comma); +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. +void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest); + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. 
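The registration state above ultimately serves the public type-parameterized test macros; a minimal sketch (the fixture and type list are illustrative):

    #include "gtest/gtest.h"

    template <typename T>
    class StackTest : public ::testing::Test {};

    TYPED_TEST_CASE_P(StackTest);

    TYPED_TEST_P(StackTest, DefaultConstructedValuesCompareEqual) {
      EXPECT_EQ(TypeParam(), TypeParam());
    }

    REGISTER_TYPED_TEST_CASE_P(StackTest, DefaultConstructedValuesCompareEqual);

    typedef ::testing::Types<int, long> MyTypes;
    INSTANTIATE_TYPED_TEST_CASE_P(My, StackTest, MyTypes);
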
The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, + // Types). Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char* prefix, + CodeLocation code_location, + const char* case_name, const char* test_names, + int index) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/" + + StreamableToString(index)).c_str(), + StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), + GetTypeName().c_str(), + NULL, // No value parameter. + code_location, + GetTypeId(), + TestClass::SetUpTestCase, + TestClass::TearDownTestCase, + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest + ::Register(prefix, code_location, case_name, test_names, index + 1); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTest { + public: + static bool Register(const char* /*prefix*/, CodeLocation, + const char* /*case_name*/, const char* /*test_names*/, + int /*index*/) { + return true; + } +}; + +// TypeParameterizedTestCase::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestCase { + public: + static bool Register(const char* prefix, CodeLocation code_location, + const TypedTestCasePState* state, + const char* case_name, const char* test_names) { + std::string test_name = StripTrailingSpaces( + GetPrefixUntilComma(test_names)); + if (!state->TestExists(test_name)) { + fprintf(stderr, "Failed to get code location for test %s.%s at %s.", + case_name, test_name.c_str(), + FormatFileLocation(code_location.file.c_str(), + code_location.line).c_str()); + fflush(stderr); + posix::Abort(); + } + const CodeLocation& test_location = state->GetCodeLocation(test_name); + + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, test_location, case_name, test_names, 0); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestCase + ::Register(prefix, code_location, state, + case_name, SkipComma(test_names)); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTestCase { + public: + static bool Register(const char* /*prefix*/, CodeLocation, + const TypedTestCasePState* /*state*/, + const char* /*case_name*/, const char* /*test_names*/) { + return true; + } +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. 
The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( + UnitTest* unit_test, int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char* str) : value(str) {} + operator bool() const { return true; } + const char* value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const UInt32 kMaxRange = 1u << 31; + + explicit Random(UInt32 seed) : state_(seed) {} + + void Reseed(UInt32 seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + UInt32 Generate(UInt32 range); + + private: + UInt32 state_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); +}; + +// Defining a variable of type CompileAssertTypesEqual will cause a +// compiler error iff T1 and T2 are different types. +template +struct CompileAssertTypesEqual; + +template +struct CompileAssertTypesEqual { +}; + +// Removes the reference from a type if it is a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::remove_reference, which is not widely available yet. +template +struct RemoveReference { typedef T type; }; // NOLINT +template +struct RemoveReference { typedef T type; }; // NOLINT + +// A handy wrapper around RemoveReference that works when the argument +// T depends on template parameters. +#define GTEST_REMOVE_REFERENCE_(T) \ + typename ::testing::internal::RemoveReference::type + +// Removes const from a type if it is a const type, otherwise leaves +// it unchanged. This is the same as tr1::remove_const, which is not +// widely available yet. +template +struct RemoveConst { typedef T type; }; // NOLINT +template +struct RemoveConst { typedef T type; }; // NOLINT + +// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above +// definition to fail to remove the const in 'const int[3]' and 'const +// char[3][4]'. The following specialization works around the bug. +template +struct RemoveConst { + typedef typename RemoveConst::type type[N]; +}; + +#if defined(_MSC_VER) && _MSC_VER < 1400 +// This is the only specialization that allows VC++ 7.1 to remove const in +// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC +// and thus needs to be conditionally compiled. +template +struct RemoveConst { + typedef typename RemoveConst::type type[N]; +}; +#endif + +// A handy wrapper around RemoveConst that works when the argument +// T depends on template parameters. 
+#define GTEST_REMOVE_CONST_(T) \ + typename ::testing::internal::RemoveConst::type + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) + +// Adds reference to a type if it is not a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::add_reference, which is not widely available yet. +template +struct AddReference { typedef T& type; }; // NOLINT +template +struct AddReference { typedef T& type; }; // NOLINT + +// A handy wrapper around AddReference that works when the argument T +// depends on template parameters. +#define GTEST_ADD_REFERENCE_(T) \ + typename ::testing::internal::AddReference::type + +// Adds a reference to const on top of T as necessary. For example, +// it transforms +// +// char ==> const char& +// const char ==> const char& +// char& ==> const char& +// const char& ==> const char& +// +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) + +// ImplicitlyConvertible::value is a compile-time bool +// constant that's true iff type From can be implicitly converted to +// type To. +template +class ImplicitlyConvertible { + private: + // We need the following helper functions only for their types. + // They have no implementations. + + // MakeFrom() is an expression whose type is From. We cannot simply + // use From(), as the type From may not have a public default + // constructor. + static typename AddReference::type MakeFrom(); + + // These two functions are overloaded. Given an expression + // Helper(x), the compiler will pick the first version if x can be + // implicitly converted to type To; otherwise it will pick the + // second version. + // + // The first version returns a value of size 1, and the second + // version returns a value of size 2. Therefore, by checking the + // size of Helper(x), which can be done at compile time, we can tell + // which version of Helper() is used, and hence whether x can be + // implicitly converted to type To. + static char Helper(To); + static char (&Helper(...))[2]; // NOLINT + + // We have to put the 'public' section after the 'private' section, + // or MSVC refuses to compile the code. + public: +#if defined(__BORLANDC__) + // C++Builder cannot use member overload resolution during template + // instantiation. The simplest workaround is to use its C++0x type traits + // functions (C++Builder 2009 and above only). + static const bool value = __is_convertible(From, To); +#else + // MSVC warns about implicitly converting from double to int for + // possible loss of data, so we need to temporarily disable the + // warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244) + static const bool value = + sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif // __BORLANDC__ +}; +template +const bool ImplicitlyConvertible::value; + +// IsAProtocolMessage::value is a compile-time bool constant that's +// true iff T is type ProtocolMessage, proto2::Message, or a subclass +// of those. 
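A sketch of the compile-time conversion probe above in action (values evaluated entirely at compile time):

    #include "gtest/gtest.h"

    // int converts to double, so kIntToDouble is true; int* does not, so
    // kPtrToDouble is false.
    const bool kIntToDouble =
        ::testing::internal::ImplicitlyConvertible<int, double>::value;
    const bool kPtrToDouble =
        ::testing::internal::ImplicitlyConvertible<int*, double>::value;
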
+template +struct IsAProtocolMessage + : public bool_constant< + ImplicitlyConvertible::value || + ImplicitlyConvertible::value> { +}; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// Note that we look for both C::iterator and C::const_iterator. The +// reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. +typedef int IsContainer; +template +IsContainer IsContainerTest(int /* dummy */, + typename C::iterator* /* it */ = NULL, + typename C::const_iterator* /* const_it */ = NULL) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } + +// EnableIf::type is void when 'Cond' is true, and +// undefined when 'Cond' is false. To use SFINAE to make a function +// overload only apply when a particular expression is true, add +// "typename EnableIf::type* = 0" as the last parameter. +template struct EnableIf; +template<> struct EnableIf { typedef void type; }; // NOLINT + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } + +// This overload is used when k >= 1. +template +inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) + return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. +template +Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) + return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. 
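// [Editor's note: illustrative sketch, not part of the upstream patch.]
// ArrayEq() and ArrayAwareFind() work on nested native arrays, e.g.:
//
//   const int a[2][3] = {{1, 2, 3}, {4, 5, 6}};
//   const int b[2][3] = {{1, 2, 3}, {4, 5, 6}};
//   bool same = ::testing::internal::ArrayEq(a, b);  // true: element-wise ==
//
//   const int needle[3] = {4, 5, 6};
//   // Yields &a[1]; each row is compared with ArrayEq(), not operator==.
//   const int (*row)[3] = ::testing::internal::ArrayAwareFind(a, a + 2, needle);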
+ +template +void CopyArray(const T* from, size_t size, U* to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T& from, U* to) { *to = from; } + +// This overload is used when k >= 1. +template +inline void CopyArray(const T(&from)[N], U(*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T* from, size_t size, U* to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +// We use 2 different structs to allow non-copyable types to be used, as long +// as RelationToSourceReference() is passed. +struct RelationToSourceReference {}; +struct RelationToSourceCopy {}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element* iterator; + typedef const Element* const_iterator; + + // Constructs from a native array. References the source. + NativeArray(const Element* array, size_t count, RelationToSourceReference) { + InitRef(array, count); + } + + // Constructs from a native array. Copies the source. + NativeArray(const Element* array, size_t count, RelationToSourceCopy) { + InitCopy(array, count); + } + + // Copy constructor. + NativeArray(const NativeArray& rhs) { + (this->*rhs.clone_)(rhs.array_, rhs.size_); + } + + ~NativeArray() { + if (clone_ != &NativeArray::InitRef) + delete[] array_; + } + + // STL-style container methods. + size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray& rhs) const { + return size() == rhs.size() && + ArrayEq(begin(), size(), rhs.begin()); + } + + private: + enum { + kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper< + Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value, + }; + + // Initializes this object with a copy of the input. + void InitCopy(const Element* array, size_t a_size) { + Element* const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + size_ = a_size; + clone_ = &NativeArray::InitCopy; + } + + // Initializes this object with a reference of the input. 
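// [Editor's note: illustrative sketch, not part of the upstream patch.]
// A NativeArray can wrap a C array without copying it:
//
//   const int values[] = {1, 2, 3};
//   ::testing::internal::NativeArray<int> view(
//       values, 3, ::testing::internal::RelationToSourceReference());
//   // view.size() == 3 and *view.begin() == 1; no copy of 'values' is made.
//   // Passing RelationToSourceCopy() instead would allocate and copy.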
+ void InitRef(const Element* array, size_t a_size) { + array_ = array; + size_ = a_size; + clone_ = &NativeArray::InitRef; + } + + const Element* array_; + size_t size_; + void (NativeArray::*clone_)(const Element*, size_t); + + GTEST_DISALLOW_ASSIGN_(NativeArray); +}; + +} // namespace internal +} // namespace testing + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) \ + = ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +// Suppresses MSVC warnings 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { statement; } + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::ConstCharPtr gtest_msg = "") { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + catch (...) { \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ + fail(gtest_msg.value) + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ + fail("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: it throws.") + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ + fail("Expected: " #statement " throws an exception.\n" \ + " Actual: it doesn't.") + + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. 
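// [Editor's note: illustrative sketch, not part of the upstream patch.]
// The GTEST_TEST_THROW_/NO_THROW_/ANY_THROW_ macros above are the plumbing
// behind the public exception assertions, which in a test body read as:
//
//   EXPECT_THROW(std::vector<int>().at(1), std::out_of_range);
//   EXPECT_NO_THROW(std::vector<int>().clear());
//   EXPECT_ANY_THROW(throw 42);
//
// Each expands to one of the macros above, with either the fatal or the
// non-fatal failure reporter passed as the 'fail' argument.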
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage(\ + gtest_ar_, text, #actual, #expected).c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ + fail("Expected: " #statement " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + test_case_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ +class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ + public:\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ + private:\ + virtual void TestBody();\ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ +};\ +\ +::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ + ::test_info_ =\ + ::testing::internal::MakeAndRegisterTestInfo(\ + #test_case_name, #test_name, NULL, NULL, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + (parent_id), \ + parent_class::SetUpTestCase, \ + parent_class::TearDownTestCase, \ + new ::testing::internal::TestFactoryImpl<\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ +void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h new file mode 100644 index 0000000000..3602942217 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h @@ -0,0 +1,243 @@ +// Copyright 2003 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Dan Egnor (egnor@google.com) +// +// A "smart" pointer type with reference tracking. Every pointer to a +// particular object is kept on a circular linked list. When the last pointer +// to an object is destroyed or reassigned, the object is deleted. +// +// Used properly, this deletes the object when the last reference goes away. +// There are several caveats: +// - Like all reference counting schemes, cycles lead to leaks. +// - Each smart pointer is actually two pointers (8 bytes instead of 4). +// - Every time a pointer is assigned, the entire list of pointers to that +// object is traversed. This class is therefore NOT SUITABLE when there +// will often be more than two or three pointers to a particular object. +// - References are only tracked as long as linked_ptr<> objects are copied. +// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS +// will happen (double deletion). +// +// A good use of this class is storing object references in STL containers. +// You can safely put linked_ptr<> in a vector<>. +// Other uses may not be as good. +// +// Note: If you use an incomplete type with linked_ptr<>, the class +// *containing* linked_ptr<> must have a constructor and destructor (even +// if they do nothing!). +// +// Bill Gibbons suggested we use something like this. +// +// Thread Safety: +// Unlike other linked_ptr implementations, in this implementation +// a linked_ptr object is thread-safe in the sense that: +// - it's safe to copy linked_ptr objects concurrently, +// - it's safe to copy *from* a linked_ptr and read its underlying +// raw pointer (e.g. via get()) concurrently, and +// - it's safe to write to two linked_ptrs that point to the same +// shared object concurrently. +// TODO(wan@google.com): rename this to safe_linked_ptr to avoid +// confusion with normal linked_ptr. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ + +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// Protects copying of all linked_ptr objects. +GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// This is used internally by all instances of linked_ptr<>. It needs to be +// a non-template class because different types of linked_ptr<> can refer to +// the same object (linked_ptr(obj) vs linked_ptr(obj)). +// So, it needs to be possible for different types of linked_ptr to participate +// in the same circular linked list, so we need a single class type here. +// +// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. +class linked_ptr_internal { + public: + // Create a new circle that includes only this instance. + void join_new() { + next_ = this; + } + + // Many linked_ptr operations may change p.link_ for some linked_ptr + // variable p in the same circle as this object. Therefore we need + // to prevent two such operations from occurring concurrently. 
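// [Editor's note: illustrative sketch, not part of the upstream patch.]
// Typical linked_ptr usage, per the class comment above, is holding
// heap-allocated objects in an STL container ('Foo' here is hypothetical):
//
//   std::vector<linked_ptr<Foo> > foos;
//   foos.push_back(linked_ptr<Foo>(new Foo));   // the container owns it now
//   foos.push_back(foos[0]);                    // both entries share one Foo
//   // The Foo is deleted when the last linked_ptr referring to it goes away.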
+ // + // Note that different types of linked_ptr objects can coexist in a + // circle (e.g. linked_ptr, linked_ptr, and + // linked_ptr). Therefore we must use a single mutex to + // protect all linked_ptr objects. This can create serious + // contention in production code, but is acceptable in a testing + // framework. + + // Join an existing circle. + void join(linked_ptr_internal const* ptr) + GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { + MutexLock lock(&g_linked_ptr_mutex); + + linked_ptr_internal const* p = ptr; + while (p->next_ != ptr) { + assert(p->next_ != this && + "Trying to join() a linked ring we are already in. " + "Is GMock thread safety enabled?"); + p = p->next_; + } + p->next_ = this; + next_ = ptr; + } + + // Leave whatever circle we're part of. Returns true if we were the + // last member of the circle. Once this is done, you can join() another. + bool depart() + GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { + MutexLock lock(&g_linked_ptr_mutex); + + if (next_ == this) return true; + linked_ptr_internal const* p = next_; + while (p->next_ != this) { + assert(p->next_ != next_ && + "Trying to depart() a linked ring we are not in. " + "Is GMock thread safety enabled?"); + p = p->next_; + } + p->next_ = next_; + return false; + } + + private: + mutable linked_ptr_internal const* next_; +}; + +template +class linked_ptr { + public: + typedef T element_type; + + // Take over ownership of a raw pointer. This should happen as soon as + // possible after the object is created. + explicit linked_ptr(T* ptr = NULL) { capture(ptr); } + ~linked_ptr() { depart(); } + + // Copy an existing linked_ptr<>, adding ourselves to the list of references. + template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } + linked_ptr(linked_ptr const& ptr) { // NOLINT + assert(&ptr != this); + copy(&ptr); + } + + // Assignment releases the old value and acquires the new. + template linked_ptr& operator=(linked_ptr const& ptr) { + depart(); + copy(&ptr); + return *this; + } + + linked_ptr& operator=(linked_ptr const& ptr) { + if (&ptr != this) { + depart(); + copy(&ptr); + } + return *this; + } + + // Smart pointer members. + void reset(T* ptr = NULL) { + depart(); + capture(ptr); + } + T* get() const { return value_; } + T* operator->() const { return value_; } + T& operator*() const { return *value_; } + + bool operator==(T* p) const { return value_ == p; } + bool operator!=(T* p) const { return value_ != p; } + template + bool operator==(linked_ptr const& ptr) const { + return value_ == ptr.get(); + } + template + bool operator!=(linked_ptr const& ptr) const { + return value_ != ptr.get(); + } + + private: + template + friend class linked_ptr; + + T* value_; + linked_ptr_internal link_; + + void depart() { + if (link_.depart()) delete value_; + } + + void capture(T* ptr) { + value_ = ptr; + link_.join_new(); + } + + template void copy(linked_ptr const* ptr) { + value_ = ptr->get(); + if (value_) + link_.join(&ptr->link_); + else + link_.join_new(); + } +}; + +template inline +bool operator==(T* ptr, const linked_ptr& x) { + return ptr == x.get(); +} + +template inline +bool operator!=(T* ptr, const linked_ptr& x) { + return ptr != x.get(); +} + +// A function to convert T* into linked_ptr +// Doing e.g. 
make_linked_ptr(new FooBarBaz(arg)) is a shorter notation +// for linked_ptr >(new FooBarBaz(arg)) +template +linked_ptr make_linked_ptr(T* ptr) { + return linked_ptr(ptr); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h new file mode 100644 index 0000000000..4d1d81d20f --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h @@ -0,0 +1,5146 @@ +// This file was GENERATED by command: +// pump.py gtest-param-util-generated.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most 50 arguments in Values, +// and at most 10 arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. +// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tuple which is +// currently set at 10. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. 
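// [Editor's note: illustrative sketch, not part of the upstream patch.]
// ValuesIn() builds a parameter generator from an array or container, e.g.
// (the test case name 'SpeedTest' and the values are hypothetical):
//
//   static const int kCpuSpeeds[] = {0, 2, 4};
//   INSTANTIATE_TEST_CASE_P(Speeds, SpeedTest,
//                           ::testing::ValuesIn(kCpuSpeeds));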
+template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]); + +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +template +class ValueArray1 { + public: + explicit ValueArray1(T1 v1) : v1_(v1) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray1& other); + + const T1 v1_; +}; + +template +class ValueArray2 { + public: + ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray2& other); + + const T1 v1_; + const T2 v2_; +}; + +template +class ValueArray3 { + public: + ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray3& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; +}; + +template +class ValueArray4 { + public: + ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray4& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; +}; + +template +class ValueArray5 { + public: + ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray5& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; +}; + +template +class ValueArray6 { + public: + ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
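// [Editor's note: illustrative sketch, not part of the upstream patch.]
// User code never names these ValueArrayN classes directly; Values() returns
// one, and the conversion operator above turns it into a ParamGenerator<T>
// ('ProfileTest' here is hypothetical):
//
//   INSTANTIATE_TEST_CASE_P(Profiles, ProfileTest,
//                           ::testing::Values(0, 1, 2));
//   // Values(0, 1, 2) yields a ValueArray3<int, int, int>, which is
//   // converted to ParamGenerator<int> via ValuesIn() on a temporary array.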
+ void operator=(const ValueArray6& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; +}; + +template +class ValueArray7 { + public: + ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray7& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; +}; + +template +class ValueArray8 { + public: + ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray8& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; +}; + +template +class ValueArray9 { + public: + ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray9& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; +}; + +template +class ValueArray10 { + public: + ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray10& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; +}; + +template +class ValueArray11 { + public: + ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray11& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; +}; + +template +class ValueArray12 { + public: + ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray12& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; +}; + +template +class ValueArray13 { + public: + ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray13& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; +}; + +template +class ValueArray14 { + public: + ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray14& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; +}; + +template +class ValueArray15 { + public: + ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray15& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; +}; + +template +class ValueArray16 { + public: + ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray16& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; +}; + +template +class ValueArray17 { + public: + ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray17& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; +}; + +template +class ValueArray18 { + public: + ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray18& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; +}; + +template +class ValueArray19 { + public: + ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray19& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; +}; + +template +class ValueArray20 { + public: + ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray20& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; +}; + +template +class ValueArray21 { + public: + ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray21& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; +}; + +template +class ValueArray22 { + public: + ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray22& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; +}; + +template +class ValueArray23 { + public: + ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray23& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; +}; + +template +class ValueArray24 { + public: + ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray24& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; +}; + +template +class ValueArray25 { + public: + ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray25& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; +}; + +template +class ValueArray26 { + public: + ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray26& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; +}; + +template +class ValueArray27 { + public: + ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray27& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; +}; + +template +class ValueArray28 { + public: + ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_)}; + return ValuesIn(array); + } + + private: + // No 
implementation - assignment is unsupported. + void operator=(const ValueArray28& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; +}; + +template +class ValueArray29 { + public: + ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray29& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; +}; + +template +class ValueArray30 { + public: + ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray30& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; +}; + +template +class ValueArray31 { + public: + ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray31& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; +}; + +template +class ValueArray32 { + public: + ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray32& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; +}; + +template +class ValueArray33 { + public: + ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray33& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; +}; + +template +class ValueArray34 { + public: + ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray34& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; +}; + +template +class ValueArray35 { + public: + ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray35& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; +}; + +template +class ValueArray36 { + public: + ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray36& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; +}; + +template +class ValueArray37 { + public: + ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray37& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; +}; + +template +class ValueArray38 { + public: + ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray38& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; +}; + +template +class ValueArray39 { + public: + ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray39& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; +}; + +template +class ValueArray40 { + public: + ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray40& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; +}; + +template +class ValueArray41 { + public: + ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray41& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; +}; + +template +class ValueArray42 { + public: + ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray42& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; +}; + +template +class ValueArray43 { + public: + ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), + v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray43& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; +}; + +template +class ValueArray44 { + public: + ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), + v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), + v43_(v43), v44_(v44) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray44& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; +}; + +template +class ValueArray45 { + public: + ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), + v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray45& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; +}; + +template +class ValueArray46 { + public: + ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray46& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; +}; + +template +class ValueArray47 { + public: + ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), + v47_(v47) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray47& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; +}; + +template +class ValueArray48 { + public: + ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), + v46_(v46), v47_(v47), v48_(v48) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_), + static_cast(v48_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray48& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; +}; + +template +class ValueArray49 { + public: + ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, + T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_), + static_cast(v48_), static_cast(v49_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray49& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; +}; + +template +class ValueArray50 { + public: + ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, + T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_), + static_cast(v48_), static_cast(v49_), static_cast(v50_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray50& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; + const T50 v50_; +}; + +# if GTEST_HAS_COMBINE +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Generates values from the Cartesian product of values produced +// by the argument generators. +// +template +class CartesianProductGenerator2 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator2(const ParamGenerator& g1, + const ParamGenerator& g2) + : g1_(g1), g2_(g2) {} + virtual ~CartesianProductGenerator2() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current2_; + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+          begin1_(other.begin1_),
+          end1_(other.end1_),
+          current1_(other.current1_),
+          begin2_(other.begin2_),
+          end2_(other.end2_),
+          current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator2::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+ virtual void Advance() { + assert(!AtEnd()); + ++current3_; + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + ParamType current_value_; + }; // class CartesianProductGenerator3::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator3& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; +}; // class CartesianProductGenerator3 + + +template +class CartesianProductGenerator4 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator4(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + virtual ~CartesianProductGenerator4() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current4_; + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + ParamType current_value_; + }; // class CartesianProductGenerator4::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator4& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; +}; // class CartesianProductGenerator4 + + +template +class CartesianProductGenerator5 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator5(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + virtual ~CartesianProductGenerator5() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current5_; + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + ParamType current_value_; + }; // class CartesianProductGenerator5::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator5& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; +}; // class CartesianProductGenerator5 + + +template +class CartesianProductGenerator6 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator6(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + virtual ~CartesianProductGenerator6() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current6_; + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + ParamType current_value_; + }; // class CartesianProductGenerator6::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator6& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; +}; // class CartesianProductGenerator6 + + +template +class CartesianProductGenerator7 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator7(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + virtual ~CartesianProductGenerator7() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current7_; + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + ParamType current_value_; + }; // class CartesianProductGenerator7::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator7& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; +}; // class CartesianProductGenerator7 + + +template +class CartesianProductGenerator8 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator8(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + virtual ~CartesianProductGenerator8() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current8_; + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + ParamType current_value_; + }; // class CartesianProductGenerator8::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator8& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; +}; // class CartesianProductGenerator8 + + +template +class CartesianProductGenerator9 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator9(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + virtual ~CartesianProductGenerator9() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9) + : base_(base), + 
begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current9_; + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + ParamType current_value_; + }; // class CartesianProductGenerator9::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator9& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; +}; // class CartesianProductGenerator9 + + +template +class CartesianProductGenerator10 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator10(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9, + const ParamGenerator& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + virtual ~CartesianProductGenerator10() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end(), g10_, g10_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9, + const ParamGenerator& g10, + const typename ParamGenerator::iterator& current10) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9), + begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current10_; + if (current10_ == end10_) { + current10_ = begin10_; + ++current9_; + } + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_ && + current10_ == typed_other->current10_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_), + begin10_(other.begin10_), + end10_(other.end10_), + current10_(other.current10_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_, *current10_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_ || + current10_ == end10_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + const typename ParamGenerator::iterator begin10_; + const typename ParamGenerator::iterator end10_; + typename ParamGenerator::iterator current10_; + ParamType current_value_; + }; // class CartesianProductGenerator10::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator10& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; + const ParamGenerator g10_; +}; // class CartesianProductGenerator10 + + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Helper classes providing Combine() with polymorphic features. They allow +// casting CartesianProductGeneratorN to ParamGenerator if T is +// convertible to U. +// +template +class CartesianProductHolder2 { + public: +CartesianProductHolder2(const Generator1& g1, const Generator2& g2) + : g1_(g1), g2_(g2) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator2( + static_cast >(g1_), + static_cast >(g2_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder2& other); + + const Generator1 g1_; + const Generator2 g2_; +}; // class CartesianProductHolder2 + +template +class CartesianProductHolder3 { + public: +CartesianProductHolder3(const Generator1& g1, const Generator2& g2, + const Generator3& g3) + : g1_(g1), g2_(g2), g3_(g3) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator3( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder3& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; +}; // class CartesianProductHolder3 + +template +class CartesianProductHolder4 { + public: +CartesianProductHolder4(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator4( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder4& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; +}; // class CartesianProductHolder4 + +template +class CartesianProductHolder5 { + public: +CartesianProductHolder5(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator5( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder5& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; +}; // class CartesianProductHolder5 + +template +class CartesianProductHolder6 { + public: +CartesianProductHolder6(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator6( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder6& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; +}; // class CartesianProductHolder6 + +template +class CartesianProductHolder7 { + public: +CartesianProductHolder7(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator7( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder7& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; +}; // class CartesianProductHolder7 + +template +class CartesianProductHolder8 { + public: +CartesianProductHolder8(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator8( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder8& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; +}; // class CartesianProductHolder8 + +template +class CartesianProductHolder9 { + public: +CartesianProductHolder9(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator9( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder9& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; +}; // class CartesianProductHolder9 + +template +class CartesianProductHolder10 { + public: +CartesianProductHolder10(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9, const Generator10& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator10( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_), + static_cast >(g10_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder10& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; + const Generator10 g10_; +}; // class CartesianProductHolder10 + +# endif // GTEST_HAS_COMBINE + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump new file mode 100644 index 0000000000..5c7c47af0b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump @@ -0,0 +1,286 @@ +$$ -*- mode: c++; -*- +$var n = 50 $$ Maximum length of Values arguments we want to support. +$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most $n arguments in Values, +// and at most $maxtuple arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. +// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tuple which is +// currently set at $maxtuple. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. 
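The CartesianProductHolderN classes generated above exist so that ::testing::Combine() converts implicitly into a ParamGenerator over ::testing::tuple. A minimal usage sketch, not part of the patch, assuming the public gtest 1.8 API as vendored here; the fixture name and values are hypothetical, and ::testing::get is assumed to be the tuple accessor this gtest provides.

#include "gtest/gtest.h"

// Hypothetical value-parameterized test over (width, height) pairs.
class BlockSizeTest
    : public ::testing::TestWithParam< ::testing::tuple<int, int> > {};

TEST_P(BlockSizeTest, AreaIsPositive) {
  const int width = ::testing::get<0>(GetParam());
  const int height = ::testing::get<1>(GetParam());
  EXPECT_GT(width * height, 0);
}

// Combine() returns a CartesianProductHolder2, which converts to
// ParamGenerator< ::testing::tuple<int, int> > and enumerates all nine pairs.
INSTANTIATE_TEST_CASE_P(Sizes, BlockSizeTest,
                        ::testing::Combine(::testing::Values(4, 8, 16),
                                           ::testing::Values(4, 8, 16)));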
+#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]); + +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +$range i 1..n +$for i [[ +$range j 1..i + +template <$for j, [[typename T$j]]> +class ValueArray$i { + public: + $if i==1 [[explicit ]]ValueArray$i($for j, [[T$j v$j]]) : $for j, [[v$(j)_(v$j)]] {} + + template + operator ParamGenerator() const { + const T array[] = {$for j, [[static_cast(v$(j)_)]]}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray$i& other); + +$for j [[ + + const T$j v$(j)_; +]] + +}; + +]] + +# if GTEST_HAS_COMBINE +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Generates values from the Cartesian product of values produced +// by the argument generators. +// +$range i 2..maxtuple +$for i [[ +$range j 1..i +$range k 2..i + +template <$for j, [[typename T$j]]> +class CartesianProductGenerator$i + : public ParamGeneratorInterface< ::testing::tuple<$for j, [[T$j]]> > { + public: + typedef ::testing::tuple<$for j, [[T$j]]> ParamType; + + CartesianProductGenerator$i($for j, [[const ParamGenerator& g$j]]) + : $for j, [[g$(j)_(g$j)]] {} + virtual ~CartesianProductGenerator$i() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, $for j, [[g$(j)_, g$(j)_.begin()]]); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, $for j, [[g$(j)_, g$(j)_.end()]]); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, $for j, [[ + + const ParamGenerator& g$j, + const typename ParamGenerator::iterator& current$(j)]]) + : base_(base), +$for j, [[ + + begin$(j)_(g$j.begin()), end$(j)_(g$j.end()), current$(j)_(current$j) +]] { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current$(i)_; + +$for k [[ + if (current$(i+2-k)_ == end$(i+2-k)_) { + current$(i+2-k)_ = begin$(i+2-k)_; + ++current$(i+2-k-1)_; + } + +]] + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ($for j && [[ + + current$(j)_ == typed_other->current$(j)_ +]]); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), $for j, [[ + + begin$(j)_(other.begin$(j)_), + end$(j)_(other.end$(j)_), + current$(j)_(other.current$(j)_) +]] { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType($for j, [[*current$(j)_]]); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return +$for j || [[ + + current$(j)_ == end$(j)_ +]]; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. +$for j [[ + + const typename ParamGenerator::iterator begin$(j)_; + const typename ParamGenerator::iterator end$(j)_; + typename ParamGenerator::iterator current$(j)_; +]] + + ParamType current_value_; + }; // class CartesianProductGenerator$i::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator$i& other); + + +$for j [[ + const ParamGenerator g$(j)_; + +]] +}; // class CartesianProductGenerator$i + + +]] + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Helper classes providing Combine() with polymorphic features. They allow +// casting CartesianProductGeneratorN to ParamGenerator if T is +// convertible to U. +// +$range i 2..maxtuple +$for i [[ +$range j 1..i + +template <$for j, [[class Generator$j]]> +class CartesianProductHolder$i { + public: +CartesianProductHolder$i($for j, [[const Generator$j& g$j]]) + : $for j, [[g$(j)_(g$j)]] {} + template <$for j, [[typename T$j]]> + operator ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >() const { + return ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >( + new CartesianProductGenerator$i<$for j, [[T$j]]>( +$for j,[[ + + static_cast >(g$(j)_) +]])); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder$i& other); + + +$for j [[ + const Generator$j g$(j)_; + +]] +}; // class CartesianProductHolder$i + +]] + +# endif // GTEST_HAS_COMBINE + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h new file mode 100644 index 0000000000..82cab9b020 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -0,0 +1,731 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include + +#include +#include +#include +#include + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-linked_ptr.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-printers.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Input to a parameterized test name generator, describing a test parameter. +// Consists of the parameter value and the integer parameter index. +template +struct TestParamInfo { + TestParamInfo(const ParamType& a_param, size_t an_index) : + param(a_param), + index(an_index) {} + ParamType param; + size_t index; +}; + +// A builtin parameterized test name generator which returns the result of +// testing::PrintToString. +struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo& info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Outputs a message explaining invalid registration of different +// fixture class for the same test case. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, + CodeLocation code_location); + +template class ParamGeneratorInterface; +template class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. 
+ virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) + impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} + scoped_ptr > impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface* Begin() const = 0; + virtual ParamIteratorInterface* End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + linked_ptr > impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. 
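A rough illustration, not part of the patch, of the iterator protocol these wrappers implement: any generator expression (Range(), Values(), ValuesIn()) converts to a ParamGenerator<T>, which behaves like a read-only STL container. The internal types are spelled out here only to make the concept concrete; test code normally never names them directly.

#include "gtest/gtest.h"

int SumOfRangeParams() {
  // Range(0, 10, 2) produces 0, 2, 4, 6, 8; the end value is excluded.
  ::testing::internal::ParamGenerator<int> gen = ::testing::Range(0, 10, 2);
  int sum = 0;
  for (::testing::internal::ParamGenerator<int>::iterator it = gen.begin();
       it != gen.end(); ++it) {
    sum += *it;  // ParamIterator::operator*() returns the element held by Current().
  }
  return sum;  // 20
}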
+template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), + step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + virtual ~RangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, begin_, 0, step_); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + value_ = static_cast(value_ + step_); + index_++; + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const T* Current() const { return &value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface(), + base_(other.base_), value_(other.value_), index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, + const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) + end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. 
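As the comment above notes, ValuesIn() copies its elements, so a generator built from a function-local array remains valid after the enclosing function returns. A small sketch, not part of the patch; the function name and values are hypothetical.

#include "gtest/gtest.h"

// The array lives on the stack, but its elements are copied into the
// generator's internal vector, so returning the generator by value is safe.
::testing::internal::ParamGenerator<int> MakeSpeedGenerator() {
  const int speeds[] = { 0, 2, 4, 6 };
  return ::testing::ValuesIn(speeds);
}

// Typically passed straight to INSTANTIATE_TEST_CASE_P(..., MakeSpeedGenerator());
// the macro captures the expression as a function and only evaluates it when
// the parameterized tests are registered.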
+template +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { + public: + template + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + virtual ~ValuesInIteratorRangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, container_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector ContainerType; + + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + ++iterator_; + value_.reset(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + virtual const T* Current() const { + if (value_.get() == NULL) + value_.reset(new T(*iterator_)); + return value_.get(); + } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + return iterator_ == + CheckedDowncastToActualType(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of scoped_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable scoped_ptr value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. + void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Default parameterized test name generator, returns a string containing the +// integer test parameter index. +template +std::string DefaultParamName(const TestParamInfo& info) { + Message name_stream; + name_stream << info.index; + return name_stream.GetString(); +} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Parameterized test name overload helpers, which help the +// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized +// test name generator and user param name generator. 
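A sketch, not part of the patch, of the two naming paths these helpers select between: with three arguments INSTANTIATE_TEST_CASE_P falls back to DefaultParamName() (the parameter index), and an optional fourth argument supplies a custom name generator. Fixture, test, and functor names below are hypothetical; generated names must be non-empty, unique within an instantiation, and contain only alphanumerics and underscores.

#include <string>
#include "gtest/gtest.h"

class SpeedTest : public ::testing::TestWithParam<int> {};
TEST_P(SpeedTest, Runs) { EXPECT_GE(GetParam(), 0); }

std::string SpeedName(const ::testing::TestParamInfo<int>& info) {
  return "Speed" + ::testing::PrintToString(info.param);
}

INSTANTIATE_TEST_CASE_P(ByIndex, SpeedTest, ::testing::Values(0, 2));
// -> ByIndex/SpeedTest.Runs/0, ByIndex/SpeedTest.Runs/1

INSTANTIATE_TEST_CASE_P(ByName, SpeedTest, ::testing::Values(0, 2), SpeedName);
// -> ByName/SpeedTest.Runs/Speed0, ByName/SpeedTest.Runs/Speed2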
+template +ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) { + return func; +} + +template +struct ParamNameGenFunc { + typedef std::string Type(const TestParamInfo&); +}; + +template +typename ParamNameGenFunc::Type *GetParamNameGen() { + return DefaultParamName; +} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) : + parameter_(parameter) {} + virtual Test* CreateTest() { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. +template +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestCaseInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. +template +class TestMetaFactory + : public TestMetaFactoryBase { + public: + typedef typename TestCase::ParamType ParamType; + + TestMetaFactory() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + return new ParameterizedTestFactory(parameter); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseInfoBase is a generic interface +// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestCaseRegistry class holds +// a collection of pointers to the ParameterizedTestCaseInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestCaseInfoBase { + public: + virtual ~ParameterizedTestCaseInfoBase() {} + + // Base part of test case name for display purposes. + virtual const string& GetTestCaseName() const = 0; + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const = 0; + // UnitTest class invokes this method to register tests in this + // test case right before running them in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + virtual void RegisterTests() = 0; + + protected: + ParameterizedTestCaseInfoBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test case and generators +// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that +// test case. It registers tests with all values generated by all +// generators when asked. +template +class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { + public: + // ParamType and GeneratorCreationFunc are private types but are required + // for declarations of public methods AddTestPattern() and + // AddTestCaseInstantiation(). + typedef typename TestCase::ParamType ParamType; + // A function that returns an instance of appropriate generator type. + typedef ParamGenerator(GeneratorCreationFunc)(); + typedef typename ParamNameGenFunc::Type ParamNameGeneratorFunc; + + explicit ParameterizedTestCaseInfo( + const char* name, CodeLocation code_location) + : test_case_name_(name), code_location_(code_location) {} + + // Test case base name for display purposes. + virtual const string& GetTestCaseName() const { return test_case_name_; } + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } + // TEST_P macro uses AddTestPattern() to record information + // about a single test in a LocalTestInfo structure. + // test_case_name is the base name of the test case (without invocation + // prefix). test_base_name is the name of an individual test without + // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is + // test case base name and DoBar is test base name. + void AddTestPattern(const char* test_case_name, + const char* test_base_name, + TestMetaFactoryBase* meta_factory) { + tests_.push_back(linked_ptr(new TestInfo(test_case_name, + test_base_name, + meta_factory))); + } + // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information + // about a generator. + int AddTestCaseInstantiation(const string& instantiation_name, + GeneratorCreationFunc* func, + ParamNameGeneratorFunc* name_func, + const char* file, + int line) { + instantiations_.push_back( + InstantiationInfo(instantiation_name, func, name_func, file, line)); + return 0; // Return value used only to run this method in namespace scope. + } + // UnitTest class invokes this method to register tests in this test case + // test cases right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more then once. 
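A compact sketch, not part of the patch, of the flow this class implements, using the same FooTest / SequenceA names as the comment above (the test code itself is hypothetical): TEST_P records a pattern via AddTestPattern(), INSTANTIATE_TEST_CASE_P records a generator via AddTestCaseInstantiation(), and RegisterTests() below later crosses every pattern with every generated value.

#include "gtest/gtest.h"

class FooTest : public ::testing::TestWithParam<int> {};

TEST_P(FooTest, DoBar) {              // -> AddTestPattern("FooTest", "DoBar", ...)
  EXPECT_GE(GetParam(), 0);
}

INSTANTIATE_TEST_CASE_P(SequenceA,    // -> AddTestCaseInstantiation("SequenceA", ...)
                        FooTest, ::testing::Values(1, 2));

// When RUN_ALL_TESTS() triggers RegisterTests(), the registered tests are
// SequenceA/FooTest.DoBar/0 and SequenceA/FooTest.DoBar/1 (default parameter
// names are the zero-based indices).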
+ virtual void RegisterTests() { + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + linked_ptr test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); gen_it != instantiations_.end(); + ++gen_it) { + const string& instantiation_name = gen_it->name; + ParamGenerator generator((*gen_it->generator)()); + ParamNameGeneratorFunc* name_func = gen_it->name_func; + const char* file = gen_it->file; + int line = gen_it->line; + + string test_case_name; + if ( !instantiation_name.empty() ) + test_case_name = instantiation_name + "/"; + test_case_name += test_info->test_case_base_name; + + size_t i = 0; + std::set test_param_names; + for (typename ParamGenerator::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + Message test_name_stream; + + std::string param_name = name_func( + TestParamInfo(*param_it, i)); + + GTEST_CHECK_(IsValidParamName(param_name)) + << "Parameterized test name '" << param_name + << "' is invalid, in " << file + << " line " << line << std::endl; + + GTEST_CHECK_(test_param_names.count(param_name) == 0) + << "Duplicate parameterized test name '" << param_name + << "', in " << file << " line " << line << std::endl; + + test_param_names.insert(param_name); + + test_name_stream << test_info->test_base_name << "/" << param_name; + MakeAndRegisterTestInfo( + test_case_name.c_str(), + test_name_stream.GetString().c_str(), + NULL, // No type parameter. + PrintToString(*param_it).c_str(), + code_location_, + GetTestCaseTypeId(), + TestCase::SetUpTestCase, + TestCase::TearDownTestCase, + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. + struct TestInfo { + TestInfo(const char* a_test_case_base_name, + const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory) : + test_case_base_name(a_test_case_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const string test_case_base_name; + const string test_base_name; + const scoped_ptr > test_meta_factory; + }; + typedef ::std::vector > TestInfoContainer; + // Records data received from INSTANTIATE_TEST_CASE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string &name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, + const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string& name) { + // Check for empty string + if (name.empty()) + return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!isalnum(name[index]) && name[index] != '_') + return false; + } + + return true; + } + + const string test_case_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); +}; // class ParameterizedTestCaseInfo + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase +// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P +// macros use it to locate their corresponding ParameterizedTestCaseInfo +// descriptors. +class ParameterizedTestCaseRegistry { + public: + ParameterizedTestCaseRegistry() {} + ~ParameterizedTestCaseRegistry() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + delete *it; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test case. + template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, + CodeLocation code_location) { + ParameterizedTestCaseInfo* typed_test_info = NULL; + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + if ((*it)->GetTestCaseName() == test_case_name) { + if ((*it)->GetTestCaseTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test case setup and tear-down in this case. + ReportInvalidTestCaseType(test_case_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestCaseInfo >(*it); + } + break; + } + } + if (typed_test_info == NULL) { + typed_test_info = new ParameterizedTestCaseInfo( + test_case_name, code_location); + test_case_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + (*it)->RegisterTests(); + } + } + + private: + typedef ::std::vector TestCaseInfoContainer; + + TestCaseInfoContainer test_case_infos_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h new file mode 100644 index 0000000000..74ab949057 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -0,0 +1,93 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +# define GTEST_OS_CYGWIN 1 +#elif defined __SYMBIAN32__ +# define GTEST_OS_SYMBIAN 1 +#elif defined _WIN32 +# define GTEST_OS_WINDOWS 1 +# ifdef _WIN32_WCE +# define GTEST_OS_WINDOWS_MOBILE 1 +# elif defined(__MINGW__) || defined(__MINGW32__) +# define GTEST_OS_WINDOWS_MINGW 1 +# elif defined(WINAPI_FAMILY) +# include +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define GTEST_OS_WINDOWS_DESKTOP 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +# define GTEST_OS_WINDOWS_PHONE 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +# define GTEST_OS_WINDOWS_RT 1 +# else + // WINAPI_FAMILY defined but no known partition matched. + // Default to desktop. +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif +# else +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif // _WIN32_WCE +#elif defined __APPLE__ +# define GTEST_OS_MAC 1 +# if TARGET_OS_IPHONE +# define GTEST_OS_IOS 1 +# endif +#elif defined __FreeBSD__ +# define GTEST_OS_FREEBSD 1 +#elif defined __linux__ +# define GTEST_OS_LINUX 1 +# if defined __ANDROID__ +# define GTEST_OS_LINUX_ANDROID 1 +# endif +#elif defined __MVS__ +# define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +# define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +# define GTEST_OS_AIX 1 +#elif defined(__hpux) +# define GTEST_OS_HPUX 1 +#elif defined __native_client__ +# define GTEST_OS_NACL 1 +#elif defined __OpenBSD__ +# define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +# define GTEST_OS_QNX 1 +#endif // __CYGWIN__ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h new file mode 100644 index 0000000000..da57e65d33 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -0,0 +1,2567 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan) +// +// Low-level types and utilities for porting Google Test to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::string, which is different to std::string). 
+// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::wstring, which is different to std::wstring). +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple +// is/isn't available. +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google +// Test's own tr1 tuple implementation should be +// used. Unused when the user sets +// GTEST_HAS_TR1_TUPLE to 0. +// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test +// is building in C++11/C++98 mode. +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_SYMBIAN - Symbian +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. 
+// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized +// tests) +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_PARAM_TEST - value-parameterized tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above two are mutually exclusive. +// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. +// +// C++11 feature wrappers: +// +// testing::internal::move - portability wrapper for std::move. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Template meta programming: +// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. +// IteratorTraits - partial implementation of std::iterator_traits, which +// is not available in libCstd when compiled with Sun C++. +// +// Smart pointers: +// scoped_ptr - as in TR2. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like +// platforms, or a reduced regular exception syntax on +// other platforms, including Windows. +// +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// Int32, UInt32, Int64, UInt64, TimeInMillis +// - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. 
+// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an Int32 environment variable. +// StringFromGTestEnv() - parses a string environment variable. + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include +#ifndef _WIN32_WCE +# include +# include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +# include +# include +#endif + +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT + +#include "gtest/internal/gtest-port-arch.h" +#include "gtest/internal/custom/gtest-port.h" + +#if !defined(GTEST_DEV_EMAIL_) +# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +# define GTEST_FLAG_PREFIX_ "gtest_" +# define GTEST_FLAG_PREFIX_DASH_ "gtest-" +# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +# define GTEST_NAME_ "Google Test" +# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +# define GTEST_GCC_VER_ \ + (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. +// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if _MSC_VER >= 1500 +# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) \ + __pragma(warning(disable: warnings)) +# define GTEST_DISABLE_MSC_WARNINGS_POP_() \ + __pragma(warning(pop)) +#else +// Older versions of MSVC don't have __pragma. +# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +# define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +#ifndef GTEST_LANG_CXX11 +// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when +// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a +// value for __cplusplus, and recent versions of clang, gcc, and +// probably other compilers set that too in C++11 mode. +# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L +// Compiling in at least C++11 mode. +# define GTEST_LANG_CXX11 1 +# else +# define GTEST_LANG_CXX11 0 +# endif +#endif + +// Distinct from C++11 language support, some environments don't provide +// proper C++11 library support. Notably, it's possible to build in +// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++ +// with no C++11 support. +// +// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__ +// 20110325, but maintenance releases in the 4.4 and 4.5 series followed +// this date, so check for those versions by their date stamps. +// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning +#if GTEST_LANG_CXX11 && \ + (!defined(__GLIBCXX__) || ( \ + __GLIBCXX__ >= 20110325ul && /* GCC >= 4.6.0 */ \ + /* Blacklist of patch releases of older branches: */ \ + __GLIBCXX__ != 20110416ul && /* GCC 4.4.6 */ \ + __GLIBCXX__ != 20120313ul && /* GCC 4.4.7 */ \ + __GLIBCXX__ != 20110428ul && /* GCC 4.5.3 */ \ + __GLIBCXX__ != 20120702ul)) /* GCC 4.5.4 */ +# define GTEST_STDLIB_CXX11 1 +#endif + +// Only use C++11 library features if the library provides them. 
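A short illustration, not part of the patch, of how later code is expected to key off these macros: GTEST_LANG_CXX11 is always defined to 0 or 1 once this header has been processed (so it is tested with #if, never #ifdef), while GTEST_STDLIB_CXX11 is defined only when the standard library itself offers usable C++11 support, per the __GLIBCXX__ date-stamp check above.

#include "gtest/internal/gtest-port.h"

#if GTEST_LANG_CXX11 && defined(GTEST_STDLIB_CXX11)
// Both the language and the library are C++11-capable: the GTEST_HAS_STD_*_
// feature macros defined next are available and C++11-only paths may be used.
#else
// C++11 language mode with a pre-C++11 library (e.g. Mac OS X 10.6's old
// libstdc++), or plain C++98: stay on the C++98 code paths.
#endif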
+#if GTEST_STDLIB_CXX11 +# define GTEST_HAS_STD_BEGIN_AND_END_ 1 +# define GTEST_HAS_STD_FORWARD_LIST_ 1 +# define GTEST_HAS_STD_FUNCTION_ 1 +# define GTEST_HAS_STD_INITIALIZER_LIST_ 1 +# define GTEST_HAS_STD_MOVE_ 1 +# define GTEST_HAS_STD_SHARED_PTR_ 1 +# define GTEST_HAS_STD_TYPE_TRAITS_ 1 +# define GTEST_HAS_STD_UNIQUE_PTR_ 1 +#endif + +// C++11 specifies that provides std::tuple. +// Some platforms still might not have it, however. +#if GTEST_LANG_CXX11 +# define GTEST_HAS_STD_TUPLE_ 1 +# if defined(__clang__) +// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include +# if defined(__has_include) && !__has_include() +# undef GTEST_HAS_STD_TUPLE_ +# endif +# elif defined(_MSC_VER) +// Inspired by boost/config/stdlib/dinkumware.hpp +# if defined(_CPPLIB_VER) && _CPPLIB_VER < 520 +# undef GTEST_HAS_STD_TUPLE_ +# endif +# elif defined(__GLIBCXX__) +// Inspired by boost/config/stdlib/libstdcpp3.hpp, +// http://gcc.gnu.org/gcc-4.2/changes.html and +// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) +# undef GTEST_HAS_STD_TUPLE_ +# endif +# endif +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS_MOBILE +# include +# include +# endif +// In order to avoid having to include , use forward declaration +// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +struct _RTL_CRITICAL_SECTION; +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +# include +# include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +# include // NOLINT +#endif + +// Defines this to true iff Google Test can use POSIX regular expressions. +#ifndef GTEST_HAS_POSIX_RE +# if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +# else +# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +# endif +#endif + +#if GTEST_USES_PCRE +// The appropriate headers have already been included. + +#elif GTEST_HAS_POSIX_RE + +// On some platforms, needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included , which is guaranteed to define size_t through +// . +# include // NOLINT + +# define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// is not available on Windows. Use our own simple regex +// implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#else + +// may not be available on this platform. Use our own +// simple regex implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_USES_PCRE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +# if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. 
+# ifndef _HAS_EXCEPTIONS +# define _HAS_EXCEPTIONS 1 +# endif // _HAS_EXCEPTIONS +# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +# elif defined(__clang__) +// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714, +// but iff cleanups are enabled after that. In Obj-C++ files, there can be +// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions +// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++ +// exceptions starting at clang r206352, but which checked for cleanups prior to +// that. To reliably check for C++ exception availability with clang, check for +// __EXCEPTIONS && __has_feature(cxx_exceptions). +# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +# elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +# define GTEST_HAS_EXCEPTIONS 1 +# else +// For other compilers, we assume exceptions are disabled to be +// conservative. +# define GTEST_HAS_EXCEPTIONS 0 +# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#if !defined(GTEST_HAS_STD_STRING) +// Even though we don't use this macro any longer, we keep it in case +// some clients still depend on it. +# define GTEST_HAS_STD_STRING 1 +#elif !GTEST_HAS_STD_STRING +// The user told us that ::std::string isn't available. +# error "Google Test cannot be used where ::std::string isn't available." +#endif // !defined(GTEST_HAS_STD_STRING) + +#ifndef GTEST_HAS_GLOBAL_STRING +// The user didn't tell us whether ::string is available, so we need +// to figure it out. + +# define GTEST_HAS_GLOBAL_STRING 0 + +#endif // GTEST_HAS_GLOBAL_STRING + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring +// is available. + +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +# define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) + +#endif // GTEST_HAS_STD_WSTRING + +#ifndef GTEST_HAS_GLOBAL_WSTRING +// The user didn't tell us whether ::wstring is available, so we need +// to figure it out. +# define GTEST_HAS_GLOBAL_WSTRING \ + (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +# ifdef _MSC_VER + +# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. 
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) + +# ifdef __GXX_RTTI +// When building against STLport with the Android NDK and with +// -frtti -fno-exceptions, the build fails at link time with undefined +// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, +// so disable RTTI when detected. +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ + !defined(__EXCEPTIONS) +# define GTEST_HAS_RTTI 0 +# else +# define GTEST_HAS_RTTI 1 +# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +# else +# define GTEST_HAS_RTTI 0 +# endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +# elif defined(__clang__) + +# define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +# ifdef __RTTI_ALL__ +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +# else + +// For all other compilers, we assume RTTI is enabled. +# define GTEST_HAS_RTTI 1 + +# endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +# include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ + || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +# include // NOLINT + +// For timespec and nanosleep, used below. +# include // NOLINT +#endif + +// Determines if hash_map/hash_set are available. +// Only used for testing against those containers. +#if !defined(GTEST_HAS_HASH_MAP_) +# if _MSC_VER +# define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available. +# define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available. +# endif // _MSC_VER +#endif // !defined(GTEST_HAS_HASH_MAP_) + +// Determines whether Google Test can use tr1/tuple. You can define +// this macro to 0 to prevent Google Test from using tuple (any +// feature depending on tuple with be disabled in this mode). +#ifndef GTEST_HAS_TR1_TUPLE +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) +// STLport, provided with the Android NDK, has neither or . +# define GTEST_HAS_TR1_TUPLE 0 +# else +// The user didn't tell us not to do it, so we assume it's OK. +# define GTEST_HAS_TR1_TUPLE 1 +# endif +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether Google Test's own tr1 tuple implementation +// should be used. +#ifndef GTEST_USE_OWN_TR1_TUPLE +// The user didn't tell us, so we need to figure it out. + +// We use our own TR1 tuple if we aren't sure the user has an +// implementation of it already. At this time, libstdc++ 4.0.0+ and +// MSVC 2010 are the only mainstream standard libraries that come +// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler +// pretends to be GCC by defining __GNUC__ and friends, but cannot +// compile GCC's tuple implementation. 
MSVC 2008 (9.0) provides TR1 +// tuple in a 323 MB Feature Pack download, which we cannot assume the +// user has. QNX's QCC compiler is a modified GCC but it doesn't +// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, +// and it can be used with some compilers that define __GNUC__. +# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ + && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 +# define GTEST_ENV_HAS_TR1_TUPLE_ 1 +# endif + +// C++11 specifies that provides std::tuple. Use that if gtest is used +// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 +// can build with clang but need to use gcc4.2's libstdc++). +# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) +# define GTEST_ENV_HAS_STD_TUPLE_ 1 +# endif + +# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ +# define GTEST_USE_OWN_TR1_TUPLE 0 +# else +# define GTEST_USE_OWN_TR1_TUPLE 1 +# endif + +#endif // GTEST_USE_OWN_TR1_TUPLE + +// To avoid conditional compilation everywhere, we make it +// gtest-port.h's responsibility to #include the header implementing +// tuple. +#if GTEST_HAS_STD_TUPLE_ +# include // IWYU pragma: export +# define GTEST_TUPLE_NAMESPACE_ ::std +#endif // GTEST_HAS_STD_TUPLE_ + +// We include tr1::tuple even if std::tuple is available to define printers for +// them. +#if GTEST_HAS_TR1_TUPLE +# ifndef GTEST_TUPLE_NAMESPACE_ +# define GTEST_TUPLE_NAMESPACE_ ::std::tr1 +# endif // GTEST_TUPLE_NAMESPACE_ + +# if GTEST_USE_OWN_TR1_TUPLE +# include "gtest/internal/gtest-tuple.h" // IWYU pragma: export // NOLINT +# elif GTEST_ENV_HAS_STD_TUPLE_ +# include +// C++11 puts its tuple into the ::std namespace rather than +// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. +// This causes undefined behavior, but supported compilers react in +// the way we intend. +namespace std { +namespace tr1 { +using ::std::get; +using ::std::make_tuple; +using ::std::tuple; +using ::std::tuple_element; +using ::std::tuple_size; +} +} + +# elif GTEST_OS_SYMBIAN + +// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to +// use STLport's tuple implementation, which unfortunately doesn't +// work as the copy of STLport distributed with Symbian is incomplete. +// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to +// use its own tuple implementation. +# ifdef BOOST_HAS_TR1_TUPLE +# undef BOOST_HAS_TR1_TUPLE +# endif // BOOST_HAS_TR1_TUPLE + +// This prevents , which defines +// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . +# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED +# include // IWYU pragma: export // NOLINT + +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) +// GCC 4.0+ implements tr1/tuple in the header. This does +// not conform to the TR1 spec, which requires the header to be . + +# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 +// Until version 4.3.2, gcc has a bug that causes , +// which is #included by , to not compile when RTTI is +// disabled. _TR1_FUNCTIONAL is the header guard for +// . Hence the following #define is a hack to prevent +// from being included. +# define _TR1_FUNCTIONAL 1 +# include +# undef _TR1_FUNCTIONAL // Allows the user to #include + // if he chooses to. +# else +# include // NOLINT +# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 + +# else +// If the compiler is not GCC 4.0+, we assume the user is using a +// spec-conforming TR1 implementation. 
+# include // IWYU pragma: export // NOLINT +# endif // GTEST_USE_OWN_TR1_TUPLE + +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +# if GTEST_OS_LINUX && !defined(__ia64__) +# if GTEST_OS_LINUX_ANDROID +// On Android, clone() is only available on ARM starting with Gingerbread. +# if defined(__arm__) && __ANDROID_API__ >= 9 +# define GTEST_HAS_CLONE 1 +# else +# define GTEST_HAS_CLONE 0 +# endif +# else +# define GTEST_HAS_CLONE 1 +# endif +# else +# define GTEST_HAS_CLONE 0 +# endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT +# define GTEST_HAS_STREAM_REDIRECTION 0 +# else +# define GTEST_HAS_STREAM_REDIRECTION 1 +# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// Google Test does not support death tests for VC 7.1 and earlier as +// abort() in a VC 7.1 application compiled as GUI in debug config +// pops up a dialog window that cannot be suppressed programmatically. +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ + GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ + GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD) +# define GTEST_HAS_DEATH_TEST 1 +#endif + +// We don't support MSVC 7.1 with exceptions disabled now. Therefore +// all the compilers we care about are adequate for supporting +// value-parameterized tests. +#define GTEST_HAS_PARAM_TEST 1 + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +# define GTEST_HAS_TYPED_TEST 1 +# define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether to support Combine(). This only makes sense when +// value-parameterized tests are enabled. The implementation doesn't +// work on Sun Studio since it doesn't understand templated conversion +// operators. +#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) +# define GTEST_HAS_COMBINE 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX +# define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. 
This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#elif defined(__clang__) +# if __has_attribute(unused) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +# endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +# define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// A macro to disallow operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_ASSIGN_(type)\ + void operator=(type const &) + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ + type(type const &);\ + GTEST_DISALLOW_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) +# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#else +# define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +# define GTEST_INTENTIONAL_CONST_COND_POP_() \ + GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +# if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +# define GTEST_HAS_SEH 1 +# else +// Assume no SEH. 
+# define GTEST_HAS_SEH 0 +# endif + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \ + || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \ + || GTEST_HAS_PTHREAD) + +#endif // GTEST_HAS_SEH + +#ifdef _MSC_VER +# if GTEST_LINKED_AS_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllimport) +# elif GTEST_CREATE_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllexport) +# endif +#elif __GNUC__ >= 4 || defined(__clang__) +# define GTEST_API_ __attribute__((visibility ("default"))) +#endif // _MSC_VER + +#ifndef GTEST_API_ +# define GTEST_API_ +#endif + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +# define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +# define GTEST_NO_INLINE_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. +#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) +# define GTEST_HAS_CXXABI_H_ 1 +#else +# define GTEST_HAS_CXXABI_H_ 0 +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +# if __has_feature(memory_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \ + __attribute__((no_sanitize_memory)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +# endif // __has_feature(memory_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(address_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +# endif // __has_feature(address_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(thread_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \ + __attribute__((no_sanitize_thread)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +# endif // __has_feature(thread_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +// A function level attribute to disable UndefinedBehaviorSanitizer's (defined) +// unsigned integer overflow instrumentation. +#if defined(__clang__) +# if defined(__has_attribute) && __has_attribute(no_sanitize) +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +# endif // defined(__has_attribute) && __has_attribute(no_sanitize) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +#endif // __clang__ + +namespace testing { + +class Message; + +#if defined(GTEST_TUPLE_NAMESPACE_) +// Import tuple and friends into the ::testing namespace. +// It is part of our interface, having them in ::testing allows us to change +// their types as needed. +using GTEST_TUPLE_NAMESPACE_::get; +using GTEST_TUPLE_NAMESPACE_::make_tuple; +using GTEST_TUPLE_NAMESPACE_::tuple; +using GTEST_TUPLE_NAMESPACE_::tuple_size; +using GTEST_TUPLE_NAMESPACE_::tuple_element; +#endif // defined(GTEST_TUPLE_NAMESPACE_) + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. 
+class Secret; + +// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, +// names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +#if GTEST_LANG_CXX11 +# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) +#else // !GTEST_LANG_CXX11 +template + struct CompileAssert { +}; + +# define GTEST_COMPILE_ASSERT_(expr, msg) \ + typedef ::testing::internal::CompileAssert<(static_cast(expr))> \ + msg[static_cast(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ +#endif // !GTEST_LANG_CXX11 + +// Implementation details of GTEST_COMPILE_ASSERT_: +// +// (In C++11, we simply use static_assert instead of the following) +// +// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type CompileAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outter parentheses in CompileAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// CompileAssert +// +// instead, these compilers will refuse to compile +// +// GTEST_COMPILE_ASSERT_(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + +// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. +// +// This template is declared, but intentionally undefined. +template +struct StaticAssertTypeEqHelper; + +template +struct StaticAssertTypeEqHelper { + enum { value = true }; +}; + +// Evaluates to the number of elements in 'array'. +#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0])) + +#if GTEST_HAS_GLOBAL_STRING +typedef ::string string; +#else +typedef ::std::string string; +#endif // GTEST_HAS_GLOBAL_STRING + +#if GTEST_HAS_GLOBAL_WSTRING +typedef ::wstring wstring; +#elif GTEST_HAS_STD_WSTRING +typedef ::std::wstring wstring; +#endif // GTEST_HAS_GLOBAL_WSTRING + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines scoped_ptr. + +// This implementation of scoped_ptr is PARTIAL - it only contains +// enough stuff to satisfy Google Test's need. 
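+//
+// Editor's illustrative sketch (not part of the upstream header): the class
+// below deletes the owned object when it goes out of scope, so typical use
+// looks like the following.
+//
+//   {
+//     testing::internal::scoped_ptr<int> value(new int(42));
+//     *value += 1;   // Use like a raw pointer via operator* / operator->.
+//   }                // The owned int is deleted here by ~scoped_ptr().
+//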
+template +class scoped_ptr { + public: + typedef T element_type; + + explicit scoped_ptr(T* p = NULL) : ptr_(p) {} + ~scoped_ptr() { reset(); } + + T& operator*() const { return *ptr_; } + T* operator->() const { return ptr_; } + T* get() const { return ptr_; } + + T* release() { + T* const ptr = ptr_; + ptr_ = NULL; + return ptr; + } + + void reset(T* p = NULL) { + if (p != ptr_) { + if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. + delete ptr_; + } + ptr_ = p; + } + } + + friend void swap(scoped_ptr& a, scoped_ptr& b) { + using std::swap; + swap(a.ptr_, b.ptr_); + } + + private: + T* ptr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); +}; + +// Defines RE. + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE& other) { Init(other.pattern()); } + + // Constructs an RE from a string. + RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT + +#if GTEST_HAS_GLOBAL_STRING + + RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT + +#endif // GTEST_HAS_GLOBAL_STRING + + RE(const char* regex) { Init(regex); } // NOLINT + ~RE(); + + // Returns the string representation of the regex. + const char* pattern() const { return pattern_; } + + // FullMatch(str, re) returns true iff regular expression re matches + // the entire str. + // PartialMatch(str, re) returns true iff regular expression re + // matches a substring of str (including str itself). + // + // TODO(wan@google.com): make FullMatch() and PartialMatch() work + // when str contains NUL characters. + static bool FullMatch(const ::std::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::std::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#if GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const ::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#endif // GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const char* str, const RE& re); + static bool PartialMatch(const char* str, const RE& re); + + private: + void Init(const char* regex); + + // We use a const char* instead of an std::string, as Google Test used to be + // used where std::string is not available. TODO(wan@google.com): change to + // std::string. + const char* pattern_; + bool is_valid_; + +#if GTEST_USES_POSIX_RE + + regex_t full_regex_; // For FullMatch(). + regex_t partial_regex_; // For PartialMatch(). + +#else // GTEST_USES_SIMPLE_RE + + const char* full_pattern_; // For FullMatch(); + +#endif + + GTEST_DISALLOW_ASSIGN_(RE); +}; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line); + +// Defines logging utilities: +// GTEST_LOG_(severity) - logs messages at the specified severity level. The +// message itself is streamed into the macro. 
+// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. + +enum GTestLogSeverity { + GTEST_INFO, + GTEST_WARNING, + GTEST_ERROR, + GTEST_FATAL +}; + +// Formats log entry severity, provides a stream object for streaming the +// log message, and terminates the message with a newline when going out of +// scope. +class GTEST_API_ GTestLog { + public: + GTestLog(GTestLogSeverity severity, const char* file, int line); + + // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. + ~GTestLog(); + + ::std::ostream& GetStream() { return ::std::cerr; } + + private: + const GTestLogSeverity severity_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); +}; + +#if !defined(GTEST_LOG_) + +# define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__).GetStream() + +inline void LogToStderr() {} +inline void FlushInfoLog() { fflush(NULL); } + +#endif // !defined(GTEST_LOG_) + +#if !defined(GTEST_CHECK_) +// INTERNAL IMPLEMENTATION - DO NOT USE. +// +// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition +// is not satisfied. +// Synopsys: +// GTEST_CHECK_(boolean_condition); +// or +// GTEST_CHECK_(boolean_condition) << "Additional message"; +// +// This checks the condition and if the condition is not satisfied +// it prints message about the condition violation, including the +// condition itself, plus additional message streamed into it, if any, +// and then it aborts the program. It aborts the program irrespective of +// whether it is built in the debug mode or not. +# define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#endif // !defined(GTEST_CHECK_) + +// An all-mode assert to verify that the given POSIX-style function +// call returns 0 (indicating success). Known limitation: this +// doesn't expand to a balanced 'if' statement, so enclose the macro +// in {} if you need to use it as the only statement in an 'if' +// branch. +#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ + << gtest_error + +#if GTEST_HAS_STD_MOVE_ +using std::move; +#else // GTEST_HAS_STD_MOVE_ +template +const T& move(const T& t) { + return t; +} +#endif // GTEST_HAS_STD_MOVE_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Use ImplicitCast_ as a safe version of static_cast for upcasting in +// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a +// const Foo*). When you use ImplicitCast_, the compiler checks that +// the cast is safe. Such explicit ImplicitCast_s are necessary in +// surprisingly many situations where C++ demands an exact type match +// instead of an argument type convertable to a target type. +// +// The syntax for using ImplicitCast_ is the same as for static_cast: +// +// ImplicitCast_(expr) +// +// ImplicitCast_ would have been part of the C++ standard library, +// but the proposal was submitted too late. It will probably make +// its way into the language in the future. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., implicit_cast). The internal +// namespace alone is not enough because the function can be found by ADL. 
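+//
+// Editor's illustrative sketch (not part of the upstream header); the types
+// Base and Derived are hypothetical. ImplicitCast_ only compiles when the
+// conversion is implicit, e.g. an upcast:
+//
+//   class Base {};
+//   class Derived : public Base {};
+//   Derived d;
+//   Base* b = ::testing::internal::ImplicitCast_<Base*>(&d);  // OK: upcast.
+//   // ImplicitCast_<Derived*>(b) would not compile: there is no implicit
+//   // conversion from Base* to Derived*.
+//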
+template +inline To ImplicitCast_(To x) { return x; } + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts +// always succeed. When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. +// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., down_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template // use like this: DownCast_(foo); +inline To DownCast_(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (false) { + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = NULL; + ::testing::internal::ImplicitCast_(to); + } + +#if GTEST_HAS_RTTI + // RTTI: debug mode only! + GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); +#endif + return static_cast(f); +} + +// Downcasts the pointer of type Base to Derived. +// Derived must be a subclass of Base. The parameter MUST +// point to a class of type Derived, not any subclass of it. +// When RTTI is available, the function performs a runtime +// check to enforce this. +template +Derived* CheckedDowncastToActualType(Base* base) { +#if GTEST_HAS_RTTI + GTEST_CHECK_(typeid(*base) == typeid(Derived)); +#endif + +#if GTEST_HAS_DOWNCAST_ + return ::down_cast(base); +#elif GTEST_HAS_RTTI + return dynamic_cast(base); // NOLINT +#else + return static_cast(base); // Poor man's downcast. +#endif +} + +#if GTEST_HAS_STREAM_REDIRECTION + +// Defines the stderr capturer: +// CaptureStdout - starts capturing stdout. +// GetCapturedStdout - stops capturing stdout and returns the captured string. +// CaptureStderr - starts capturing stderr. +// GetCapturedStderr - stops capturing stderr and returns the captured string. +// +GTEST_API_ void CaptureStdout(); +GTEST_API_ std::string GetCapturedStdout(); +GTEST_API_ void CaptureStderr(); +GTEST_API_ std::string GetCapturedStderr(); + +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Returns a path to temporary directory. +GTEST_API_ std::string TempDir(); + +// Returns the size (in bytes) of a file. +GTEST_API_ size_t GetFileSize(FILE* file); + +// Reads the entire content of a file as a string. +GTEST_API_ std::string ReadEntireFile(FILE* file); + +// All command line arguments. 
+GTEST_API_ const ::std::vector& GetArgvs(); + +#if GTEST_HAS_DEATH_TEST + +const ::std::vector& GetInjectableArgvs(); +void SetInjectableArgvs(const ::std::vector* + new_argvs); + + +#endif // GTEST_HAS_DEATH_TEST + +// Defines synchronization primitives. +#if GTEST_IS_THREADSAFE +# if GTEST_HAS_PTHREAD +// Sleeps for (roughly) n milliseconds. This function is only for testing +// Google Test's own constructs. Don't use it in user tests, either +// directly or indirectly. +inline void SleepMilliseconds(int n) { + const timespec time = { + 0, // 0 seconds. + n * 1000L * 1000L, // And n ms. + }; + nanosleep(&time, NULL); +} +# endif // GTEST_HAS_PTHREAD + +# if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. + +# elif GTEST_HAS_PTHREAD +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +class Notification { + public: + Notification() : notified_(false) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + } + ~Notification() { + pthread_mutex_destroy(&mutex_); + } + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + pthread_mutex_lock(&mutex_); + notified_ = true; + pthread_mutex_unlock(&mutex_); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + for (;;) { + pthread_mutex_lock(&mutex_); + const bool notified = notified_; + pthread_mutex_unlock(&mutex_); + if (notified) + break; + SleepMilliseconds(10); + } + } + + private: + pthread_mutex_t mutex_; + bool notified_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; + +# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +GTEST_API_ void SleepMilliseconds(int n); + +// Provides leak-safe Windows kernel handle ownership. +// Used in death tests and in threading support. +class GTEST_API_ AutoHandle { + public: + // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to + // avoid including in this header file. Including is + // undesirable because it defines a lot of symbols and macros that tend to + // conflict with client code. This assumption is verified by + // WindowsTypesTest.HANDLEIsVoidStar. + typedef void* Handle; + AutoHandle(); + explicit AutoHandle(Handle handle); + + ~AutoHandle(); + + Handle Get() const; + void Reset(); + void Reset(Handle handle); + + private: + // Returns true iff the handle is a valid handle object that can be closed. + bool IsCloseable() const; + + Handle handle_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); +}; + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. 
+class GTEST_API_ Notification { + public: + Notification(); + void Notify(); + void WaitForNotification(); + + private: + AutoHandle event_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; +# endif // GTEST_HAS_NOTIFICATION_ + +// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD +// defined, but we don't want to use MinGW's pthreads implementation, which +// has conformance problems with some versions of the POSIX standard. +# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). +extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { + static_cast(thread)->Run(); + return NULL; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : func_(func), + param_(param), + thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase* const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); + finished_ = true; + } + } + + virtual void Run() { + if (thread_can_start_ != NULL) + thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + UserThreadFunc* const func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification* const thread_can_start_; + bool finished_; // true iff we know that the thread function has finished. + pthread_t thread_; // The native thread object. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; +# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +// Mutex and ThreadLocal have already been imported into the namespace. +// Nothing to do here. 
+ +# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Mutex implements mutex on Windows platforms. It is used in conjunction +// with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the +// // end of the current scope. +// +// A static Mutex *must* be defined or declared using one of the following +// macros: +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// (A non-static Mutex is defined/declared in the usual way). +class GTEST_API_ Mutex { + public: + enum MutexType { kStatic = 0, kDynamic = 1 }; + // We rely on kStaticMutex being 0 as it is to what the linker initializes + // type_ in static mutexes. critical_section_ will be initialized lazily + // in ThreadSafeLazyInit(). + enum StaticConstructorSelector { kStaticMutex = 0 }; + + // This constructor intentionally does nothing. It relies on type_ being + // statically initialized to 0 (effectively setting it to kStatic) and on + // ThreadSafeLazyInit() to lazily initialize the rest of the members. + explicit Mutex(StaticConstructorSelector /*dummy*/) {} + + Mutex(); + ~Mutex(); + + void Lock(); + + void Unlock(); + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld(); + + private: + // Initializes owner_thread_id_ and critical_section_ in static mutexes. + void ThreadSafeLazyInit(); + + // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx, + // we assume that 0 is an invalid value for thread IDs. + unsigned int owner_thread_id_; + + // For static mutexes, we rely on these members being initialized to zeros + // by the linker. + MutexType type_; + long critical_section_init_phase_; // NOLINT + _RTL_CRITICAL_SECTION* critical_section_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. 
+ virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) { + } + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc* func, T param) + : func_(func), + param_(param) { + } + virtual ~RunnableImpl() {} + virtual void Run() { + func_(param_); + } + + private: + UserThreadFunc* const func_; + const T param_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + }; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. 
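+//
+// Editor's illustrative sketch (not part of the upstream header); the name
+// g_counter is hypothetical. A ThreadLocal at namespace scope gives each
+// thread its own independent copy of the value:
+//
+//   static testing::internal::ThreadLocal<int> g_counter(0);
+//   ...
+//   g_counter.set(g_counter.get() + 1);  // Affects the calling thread only.
+//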
+template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + + T* GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer(); + } + + virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + virtual ValueHolder* MakeNewHolder() const { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + scoped_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. 
+ // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, pthread_t() } + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + has_owner_ = false; + } + ~Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. 
+ class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast(pthread_getspecific(key_)); + if (holder != NULL) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder* const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + virtual ValueHolder* MakeNewHolder() const { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + scoped_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. 
+class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +// Passing non-POD classes through ellipsis (...) crashes the ARM +// compiler and generates a warning in Sun Studio. The Nokia Symbian +// and the IBM XL C/C++ compiler try to instantiate a copy constructor +// for objects passed through ellipsis (...), failing for uncopyable +// objects. We define this to ensure that only POD is passed through +// ellipsis on these systems. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). +# define GTEST_ELLIPSIS_NEEDS_POD_ 1 +#else +# define GTEST_CAN_COMPARE_NULL 1 +#endif + +// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between +// const T& and const T* in a function template. These compilers +// _can_ decide between class template specializations for T and T*, +// so a tr1::type_traits-like is_pointer works. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) +# define GTEST_NEEDS_IS_POINTER_ 1 +#endif + +template +struct bool_constant { + typedef bool_constant type; + static const bool value = bool_value; +}; +template const bool bool_constant::value; + +typedef bool_constant false_type; +typedef bool_constant true_type; + +template +struct is_pointer : public false_type {}; + +template +struct is_pointer : public true_type {}; + +template +struct IteratorTraits { + typedef typename Iterator::value_type value_type; +}; + +template +struct IteratorTraits { + typedef T value_type; +}; + +template +struct IteratorTraits { + typedef T value_type; +}; + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_SEP_ "\\" +# define GTEST_HAS_ALT_PATH_SEP_ 1 +// The biggest signed integer type the compiler supports. +typedef __int64 BiggestInt; +#else +# define GTEST_PATH_SEP_ "/" +# define GTEST_HAS_ALT_PATH_SEP_ 0 +typedef long long BiggestInt; // NOLINT +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). +// Therefore we need to cast a char to unsigned char before calling +// isspace(), etc. 
+ +inline bool IsAlpha(char ch) { + return isalpha(static_cast(ch)) != 0; +} +inline bool IsAlNum(char ch) { + return isalnum(static_cast(ch)) != 0; +} +inline bool IsDigit(char ch) { + return isdigit(static_cast(ch)) != 0; +} +inline bool IsLower(char ch) { + return islower(static_cast(ch)) != 0; +} +inline bool IsSpace(char ch) { + return isspace(static_cast(ch)) != 0; +} +inline bool IsUpper(char ch) { + return isupper(static_cast(ch)) != 0; +} +inline bool IsXDigit(char ch) { + return isxdigit(static_cast(ch)) != 0; +} +inline bool IsXDigit(wchar_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} + +inline char ToLower(char ch) { + return static_cast(tolower(static_cast(ch))); +} +inline char ToUpper(char ch) { + return static_cast(toupper(static_cast(ch))); +} + +inline std::string StripTrailingSpaces(std::string str) { + std::string::iterator it = str.end(); + while (it != str.begin() && IsSpace(*--it)) + it = str.erase(it); + return str; +} + +// The testing::internal::posix namespace holds wrappers for common +// POSIX functions. These wrappers hide the differences between +// Windows/MSVC and POSIX systems. Since some compilers define these +// standard functions as macros, the wrapper cannot have the same name +// as the wrapped function. + +namespace posix { + +// Functions with a different name on Windows. + +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +# ifdef __BORLANDC__ +inline int IsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +# else // !__BORLANDC__ +# if GTEST_OS_WINDOWS_MOBILE +inline int IsATTY(int /* fd */) { return 0; } +# else +inline int IsATTY(int fd) { return _isatty(fd); } +# endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +# endif // __BORLANDC__ + +# if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. +# else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { + return (_S_IFDIR & st.st_mode) != 0; +} +# endif // GTEST_OS_WINDOWS_MOBILE + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */) + +inline const char* StrNCpy(char* dest, const char* src, size_t n) { + return strncpy(dest, src, n); +} + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. 
+ +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE | GTEST_OS_WINDOWS_RT + // We are on Windows CE, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? env : NULL; +#else + return getenv(name); +#endif +} + +GTEST_DISABLE_MSC_WARNINGS_POP_() + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. +void Abort(); +#else +inline void Abort() { abort(); } +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +# define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't +// complain about _snprintf. +# define GTEST_SNPRINTF_ _snprintf +#else +# define GTEST_SNPRINTF_ snprintf +#endif + +// The maximum number a BiggestInt can represent. This definition +// works no matter BiggestInt is represented in one's complement or +// two's complement. +// +// We cannot rely on numeric_limits in STL, as __int64 and long long +// are not part of standard C++ and numeric_limits doesn't need to be +// defined for them. +const BiggestInt kMaxBiggestInt = + ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. 
+// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + typedef void UInt; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + // unsigned int has size 4 in both gcc and MSVC. + // + // As base/basictypes.h doesn't compile on Windows, we cannot use + // uint32, uint64, and etc here. + typedef int Int; + typedef unsigned int UInt; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: +#if GTEST_OS_WINDOWS + typedef __int64 Int; + typedef unsigned __int64 UInt; +#else + typedef long long Int; // NOLINT + typedef unsigned long long UInt; // NOLINT +#endif // GTEST_OS_WINDOWS +}; + +// Integer types of known sizes. +typedef TypeWithSize<4>::Int Int32; +typedef TypeWithSize<4>::UInt UInt32; +typedef TypeWithSize<8>::Int Int64; +typedef TypeWithSize<8>::UInt UInt64; +typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#if !defined(GTEST_FLAG) +# define GTEST_FLAG(name) FLAGS_gtest_##name +#endif // !defined(GTEST_FLAG) + +#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 +#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) + +#if !defined(GTEST_DECLARE_bool_) +# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +// Macros for declaring flags. +# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) +# define GTEST_DECLARE_int32_(name) \ + GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) +#define GTEST_DECLARE_string_(name) \ + GTEST_API_ extern ::std::string GTEST_FLAG(name) + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) + +#endif // !defined(GTEST_DECLARE_bool_) + +// Thread annotations +#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) +# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +# define GTEST_LOCK_EXCLUDED_(locks) +#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +// TODO(chandlerc): Find a better way to refactor flag and environment parsing +// out of both gtest-port.cc and gtest.cc to avoid exporting this utility +// function. +bool ParseInt32(const Message& src_text, const char* str, Int32* value); + +// Parses a bool/Int32/string from the environment variable +// corresponding to the given Google Test flag. 
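// A flag named "foo" maps to the environment variable GTEST_FOO; for
// instance (an illustrative sketch, mirroring how the flag defaults are
// initialized elsewhere in Google Test):
//
//   StringFromGTestEnv("color", "auto");   // consults $GTEST_COLOR
//   BoolFromGTestEnv("print_time", true);  // consults $GTEST_PRINT_TIME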
+bool BoolFromGTestEnv(const char* flag, bool default_val); +GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); +std::string StringFromGTestEnv(const char* flag, const char* default_val); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h new file mode 100644 index 0000000000..97f1a7fdd2 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -0,0 +1,167 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by . +// It should not be #included by other files. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +# include +#endif + +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). 
+ static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true iff they have the same content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true iff they have the same + // content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, + const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Returns true iff the given string ends with the given suffix, ignoring + // case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix); + + // Formats an int value as "%02d". 
+ static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". +GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h new file mode 100644 index 0000000000..e9b405340a --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h @@ -0,0 +1,1020 @@ +// This file was GENERATED by command: +// pump.py gtest-tuple.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// hack bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. 
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: +#else +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ + template friend class tuple; \ + private: +#endif + +// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict +// with our own definitions. Therefore using our own tuple does not work on +// those compilers. +#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */ +# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \ +GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers." +#endif + +// GTEST_n_TUPLE_(T) is the type of an n-tuple. +#define GTEST_0_TUPLE_(T) tuple<> +#define GTEST_1_TUPLE_(T) tuple +#define GTEST_2_TUPLE_(T) tuple +#define GTEST_3_TUPLE_(T) tuple +#define GTEST_4_TUPLE_(T) tuple +#define GTEST_5_TUPLE_(T) tuple +#define GTEST_6_TUPLE_(T) tuple +#define GTEST_7_TUPLE_(T) tuple +#define GTEST_8_TUPLE_(T) tuple +#define GTEST_9_TUPLE_(T) tuple +#define GTEST_10_TUPLE_(T) tuple + +// GTEST_n_TYPENAMES_(T) declares a list of n typenames. +#define GTEST_0_TYPENAMES_(T) +#define GTEST_1_TYPENAMES_(T) typename T##0 +#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 +#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 +#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3 +#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4 +#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5 +#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6 +#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 +#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8 +#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8, typename T##9 + +// In theory, defining stuff in the ::std namespace is undefined +// behavior. We can do this as we are playing the role of a standard +// library vendor. +namespace std { +namespace tr1 { + +template +class tuple; + +// Anything in namespace gtest_internal is Google Test's INTERNAL +// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. +namespace gtest_internal { + +// ByRef::type is T if T is a reference; otherwise it's const T&. +template +struct ByRef { typedef const T& type; }; // NOLINT +template +struct ByRef { typedef T& type; }; // NOLINT + +// A handy wrapper for ByRef. +#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type + +// AddRef::type is T if T is a reference; otherwise it's T&. This +// is the same as tr1::add_reference::type. +template +struct AddRef { typedef T& type; }; // NOLINT +template +struct AddRef { typedef T& type; }; // NOLINT + +// A handy wrapper for AddRef. +#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type + +// A helper for implementing get(). +template class Get; + +// A helper for implementing tuple_element. kIndexValid is true +// iff k < the number of fields in tuple type T. 
+template +struct TupleElement; + +template +struct TupleElement { + typedef T0 type; +}; + +template +struct TupleElement { + typedef T1 type; +}; + +template +struct TupleElement { + typedef T2 type; +}; + +template +struct TupleElement { + typedef T3 type; +}; + +template +struct TupleElement { + typedef T4 type; +}; + +template +struct TupleElement { + typedef T5 type; +}; + +template +struct TupleElement { + typedef T6 type; +}; + +template +struct TupleElement { + typedef T7 type; +}; + +template +struct TupleElement { + typedef T8 type; +}; + +template +struct TupleElement { + typedef T9 type; +}; + +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + +template +class GTEST_1_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} + + tuple(const tuple& t) : f0_(t.f0_) {} + + template + tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_1_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { + f0_ = t.f0_; + return *this; + } + + T0 f0_; +}; + +template +class GTEST_2_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), + f1_(f1) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} + + template + tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} + template + tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_2_TUPLE_(U)& t) { + return CopyFrom(t); + } + template + tuple& operator=(const ::std::pair& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + return *this; + } + + T0 f0_; + T1 f1_; +}; + +template +class GTEST_3_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + template + tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_3_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; +}; + +template +class GTEST_4_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} + + template + tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_4_TUPLE_(U)& 
t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; +}; + +template +class GTEST_5_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, + GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_) {} + + template + tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_5_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; +}; + +template +class GTEST_6_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_) {} + + template + tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_6_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; +}; + +template +class GTEST_7_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + template + tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_7_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; +}; + +template +class GTEST_8_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + 
GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, + GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + template + tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_8_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; +}; + +template +class GTEST_9_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + template + tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_9_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; +}; + +template +class tuple { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), + f9_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} + + template + tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), + f9_(t.f9_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_10_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + f9_ = t.f9_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 
f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; + T9 f9_; +}; + +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper to make_tuple(). And we don't +// implement tie(). + +inline tuple<> make_tuple() { return tuple<>(); } + +template +inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { + return GTEST_1_TUPLE_(T)(f0); +} + +template +inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { + return GTEST_2_TUPLE_(T)(f0, f1); +} + +template +inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { + return GTEST_3_TUPLE_(T)(f0, f1, f2); +} + +template +inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3) { + return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); +} + +template +inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4) { + return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); +} + +template +inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5) { + return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); +} + +template +inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6) { + return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); +} + +template +inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { + return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); +} + +template +inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8) { + return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); +} + +template +inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8, const T9& f9) { + return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); +} + +// 6.1.3.3 Tuple helper classes. + +template struct tuple_size; + +template +struct tuple_size { + static const int value = 0; +}; + +template +struct tuple_size { + static const int value = 1; +}; + +template +struct tuple_size { + static const int value = 2; +}; + +template +struct tuple_size { + static const int value = 3; +}; + +template +struct tuple_size { + static const int value = 4; +}; + +template +struct tuple_size { + static const int value = 5; +}; + +template +struct tuple_size { + static const int value = 6; +}; + +template +struct tuple_size { + static const int value = 7; +}; + +template +struct tuple_size { + static const int value = 8; +}; + +template +struct tuple_size { + static const int value = 9; +}; + +template +struct tuple_size { + static const int value = 10; +}; + +template +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type + +// 6.1.3.4 Element access. 
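//
// A brief usage sketch (illustrative only; it relies on the get<k>() helpers
// defined below):
//
//   ::std::tr1::tuple<int, char> t = ::std::tr1::make_tuple(1, 'a');
//   ::std::tr1::get<0>(t);  // yields 1
//   ::std::tr1::get<1>(t);  // yields 'a'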
+ +namespace gtest_internal { + +template <> +class Get<0> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + Field(Tuple& t) { return t.f0_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + ConstField(const Tuple& t) { return t.f0_; } +}; + +template <> +class Get<1> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + Field(Tuple& t) { return t.f1_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + ConstField(const Tuple& t) { return t.f1_; } +}; + +template <> +class Get<2> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + Field(Tuple& t) { return t.f2_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + ConstField(const Tuple& t) { return t.f2_; } +}; + +template <> +class Get<3> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + Field(Tuple& t) { return t.f3_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + ConstField(const Tuple& t) { return t.f3_; } +}; + +template <> +class Get<4> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + Field(Tuple& t) { return t.f4_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + ConstField(const Tuple& t) { return t.f4_; } +}; + +template <> +class Get<5> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + Field(Tuple& t) { return t.f5_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + ConstField(const Tuple& t) { return t.f5_; } +}; + +template <> +class Get<6> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + Field(Tuple& t) { return t.f6_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + ConstField(const Tuple& t) { return t.f6_; } +}; + +template <> +class Get<7> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + Field(Tuple& t) { return t.f7_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + ConstField(const Tuple& t) { return t.f7_; } +}; + +template <> +class Get<8> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + Field(Tuple& t) { return t.f8_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + ConstField(const Tuple& t) { return t.f8_; } +}; + +template <> +class Get<9> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + Field(Tuple& t) { return t.f9_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + ConstField(const Tuple& t) { return t.f9_; } +}; + +} // namespace gtest_internal + +template +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::Field(t); +} + +template +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(const GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. 
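// For example, the k == 2 instantiation unrolls (illustrative only) to
//
//   SameSizeTuplePrefixComparator<1, 1>::Eq(t1, t2) &&
//       ::std::tr1::get<1>(t1) == ::std::tr1::get<1>(t2)
//
// which bottoms out at the <0, 0> specialization and therefore compares
// get<0> and get<1> of both tuples.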
+template +struct SameSizeTuplePrefixComparator; + +template <> +struct SameSizeTuplePrefixComparator<0, 0> { + template + static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { + return true; + } +}; + +template +struct SameSizeTuplePrefixComparator { + template + static bool Eq(const Tuple1& t1, const Tuple2& t2) { + return SameSizeTuplePrefixComparator::Eq(t1, t2) && + ::std::tr1::get(t1) == ::std::tr1::get(t2); + } +}; + +} // namespace gtest_internal + +template +inline bool operator==(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { + return gtest_internal::SameSizeTuplePrefixComparator< + tuple_size::value, + tuple_size::value>::Eq(t, u); +} + +template +inline bool operator!=(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { return !(t == u); } + +// 6.1.4 Pairs. +// Unimplemented. + +} // namespace tr1 +} // namespace std + +#undef GTEST_0_TUPLE_ +#undef GTEST_1_TUPLE_ +#undef GTEST_2_TUPLE_ +#undef GTEST_3_TUPLE_ +#undef GTEST_4_TUPLE_ +#undef GTEST_5_TUPLE_ +#undef GTEST_6_TUPLE_ +#undef GTEST_7_TUPLE_ +#undef GTEST_8_TUPLE_ +#undef GTEST_9_TUPLE_ +#undef GTEST_10_TUPLE_ + +#undef GTEST_0_TYPENAMES_ +#undef GTEST_1_TYPENAMES_ +#undef GTEST_2_TYPENAMES_ +#undef GTEST_3_TYPENAMES_ +#undef GTEST_4_TYPENAMES_ +#undef GTEST_5_TYPENAMES_ +#undef GTEST_6_TYPENAMES_ +#undef GTEST_7_TYPENAMES_ +#undef GTEST_8_TYPENAMES_ +#undef GTEST_9_TYPENAMES_ +#undef GTEST_10_TYPENAMES_ + +#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ +#undef GTEST_BY_REF_ +#undef GTEST_ADD_REF_ +#undef GTEST_TUPLE_ELEMENT_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump new file mode 100644 index 0000000000..429ddfeeca --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump @@ -0,0 +1,347 @@ +$$ -*- mode: c++; -*- +$var n = 10 $$ Maximum number of tuple fields we want to support. +$$ This meta comment fixes auto-indentation in Emacs. }} +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// hack bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. +#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: +#else +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ + template friend class tuple; \ + private: +#endif + +// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict +// with our own definitions. Therefore using our own tuple does not work on +// those compilers. +#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */ +# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \ +GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers." +#endif + + +$range i 0..n-1 +$range j 0..n +$range k 1..n +// GTEST_n_TUPLE_(T) is the type of an n-tuple. +#define GTEST_0_TUPLE_(T) tuple<> + +$for k [[ +$range m 0..k-1 +$range m2 k..n-1 +#define GTEST_$(k)_TUPLE_(T) tuple<$for m, [[T##$m]]$for m2 [[, void]]> + +]] + +// GTEST_n_TYPENAMES_(T) declares a list of n typenames. + +$for j [[ +$range m 0..j-1 +#define GTEST_$(j)_TYPENAMES_(T) $for m, [[typename T##$m]] + + +]] + +// In theory, defining stuff in the ::std namespace is undefined +// behavior. We can do this as we are playing the role of a standard +// library vendor. +namespace std { +namespace tr1 { + +template <$for i, [[typename T$i = void]]> +class tuple; + +// Anything in namespace gtest_internal is Google Test's INTERNAL +// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. +namespace gtest_internal { + +// ByRef::type is T if T is a reference; otherwise it's const T&. +template +struct ByRef { typedef const T& type; }; // NOLINT +template +struct ByRef { typedef T& type; }; // NOLINT + +// A handy wrapper for ByRef. +#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type + +// AddRef::type is T if T is a reference; otherwise it's T&. This +// is the same as tr1::add_reference::type. +template +struct AddRef { typedef T& type; }; // NOLINT +template +struct AddRef { typedef T& type; }; // NOLINT + +// A handy wrapper for AddRef. +#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type + +// A helper for implementing get(). +template class Get; + +// A helper for implementing tuple_element. kIndexValid is true +// iff k < the number of fields in tuple type T. 
+template +struct TupleElement; + + +$for i [[ +template +struct TupleElement { + typedef T$i type; +}; + + +]] +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + + +$for k [[ +$range m 0..k-1 +template +class $if k < n [[GTEST_$(k)_TUPLE_(T)]] $else [[tuple]] { + public: + template friend class gtest_internal::Get; + + tuple() : $for m, [[f$(m)_()]] {} + + explicit tuple($for m, [[GTEST_BY_REF_(T$m) f$m]]) : [[]] +$for m, [[f$(m)_(f$m)]] {} + + tuple(const tuple& t) : $for m, [[f$(m)_(t.f$(m)_)]] {} + + template + tuple(const GTEST_$(k)_TUPLE_(U)& t) : $for m, [[f$(m)_(t.f$(m)_)]] {} + +$if k == 2 [[ + template + tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} + +]] + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_$(k)_TUPLE_(U)& t) { + return CopyFrom(t); + } + +$if k == 2 [[ + template + tuple& operator=(const ::std::pair& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + +]] + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_$(k)_TUPLE_(U)& t) { + +$for m [[ + f$(m)_ = t.f$(m)_; + +]] + return *this; + } + + +$for m [[ + T$m f$(m)_; + +]] +}; + + +]] +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper to make_tuple(). And we don't +// implement tie(). + +inline tuple<> make_tuple() { return tuple<>(); } + +$for k [[ +$range m 0..k-1 + +template +inline GTEST_$(k)_TUPLE_(T) make_tuple($for m, [[const T$m& f$m]]) { + return GTEST_$(k)_TUPLE_(T)($for m, [[f$m]]); +} + +]] + +// 6.1.3.3 Tuple helper classes. + +template struct tuple_size; + + +$for j [[ +template +struct tuple_size { + static const int value = $j; +}; + + +]] +template +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type + +// 6.1.3.4 Element access. + +namespace gtest_internal { + + +$for i [[ +template <> +class Get<$i> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple)) + Field(Tuple& t) { return t.f$(i)_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple)) + ConstField(const Tuple& t) { return t.f$(i)_; } +}; + + +]] +} // namespace gtest_internal + +template +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T))) +get(GTEST_$(n)_TUPLE_(T)& t) { + return gtest_internal::Get::Field(t); +} + +template +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T))) +get(const GTEST_$(n)_TUPLE_(T)& t) { + return gtest_internal::Get::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. 
+template +struct SameSizeTuplePrefixComparator; + +template <> +struct SameSizeTuplePrefixComparator<0, 0> { + template + static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { + return true; + } +}; + +template +struct SameSizeTuplePrefixComparator { + template + static bool Eq(const Tuple1& t1, const Tuple2& t2) { + return SameSizeTuplePrefixComparator::Eq(t1, t2) && + ::std::tr1::get(t1) == ::std::tr1::get(t2); + } +}; + +} // namespace gtest_internal + +template +inline bool operator==(const GTEST_$(n)_TUPLE_(T)& t, + const GTEST_$(n)_TUPLE_(U)& u) { + return gtest_internal::SameSizeTuplePrefixComparator< + tuple_size::value, + tuple_size::value>::Eq(t, u); +} + +template +inline bool operator!=(const GTEST_$(n)_TUPLE_(T)& t, + const GTEST_$(n)_TUPLE_(U)& u) { return !(t == u); } + +// 6.1.4 Pairs. +// Unimplemented. + +} // namespace tr1 +} // namespace std + + +$for j [[ +#undef GTEST_$(j)_TUPLE_ + +]] + + +$for j [[ +#undef GTEST_$(j)_TYPENAMES_ + +]] + +#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ +#undef GTEST_BY_REF_ +#undef GTEST_ADD_REF_ +#undef GTEST_TUPLE_ELEMENT_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h new file mode 100644 index 0000000000..e46f7cfcb4 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -0,0 +1,3331 @@ +// This file was GENERATED by command: +// pump.py gtest-type-util.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most 50 types in a list, and at most 50 +// type-parameterized tests in one type-parameterized test case. 
+// Please contact googletestframework@googlegroups.com if you need +// more. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +# if GTEST_HAS_CXXABI_H_ +# include +# elif defined(__HP_aCC) +# include +# endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template +std::string GetTypeName() { +# if GTEST_HAS_RTTI + + const char* const name = typeid(T).name(); +# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +# if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +# endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, 0, 0, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return name_str; +# else + return name; +# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC + +# else + + return ""; + +# endif // GTEST_HAS_RTTI +} + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// AssertyTypeEq::type is defined iff T1 and T2 are the same +// type. This can be used as a compile-time assertion to ensure that +// two types are equal. + +template +struct AssertTypeEq; + +template +struct AssertTypeEq { + typedef bool type; +}; + +// A unique type used as the default value for the arguments of class +// template Types. This allows us to simulate variadic templates +// (e.g. Types, Type, and etc), which C++ doesn't +// support directly. +struct None {}; + +// The following family of struct and struct templates are used to +// represent type lists. In particular, TypesN +// represents a type list with N types (T1, T2, ..., and TN) in it. +// Except for Types0, every struct in the family has two member types: +// Head for the first type in the list, and Tail for the rest of the +// list. + +// The empty type list. +struct Types0 {}; + +// Type lists of length 1, 2, 3, and so on. 
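// For instance (an illustrative decomposition):
//
//   Types3<int, char, double>::Head  is  int
//   Types3<int, char, double>::Tail  is  Types2<char, double>
//
// so a list can be traversed recursively, peeling off one Head per step.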
+ +template +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; +template +struct Types2 { + typedef T1 Head; + typedef Types1 Tail; +}; + +template +struct Types3 { + typedef T1 Head; + typedef Types2 Tail; +}; + +template +struct Types4 { + typedef T1 Head; + typedef Types3 Tail; +}; + +template +struct Types5 { + typedef T1 Head; + typedef Types4 Tail; +}; + +template +struct Types6 { + typedef T1 Head; + typedef Types5 Tail; +}; + +template +struct Types7 { + typedef T1 Head; + typedef Types6 Tail; +}; + +template +struct Types8 { + typedef T1 Head; + typedef Types7 Tail; +}; + +template +struct Types9 { + typedef T1 Head; + typedef Types8 Tail; +}; + +template +struct Types10 { + typedef T1 Head; + typedef Types9 Tail; +}; + +template +struct Types11 { + typedef T1 Head; + typedef Types10 Tail; +}; + +template +struct Types12 { + typedef T1 Head; + typedef Types11 Tail; +}; + +template +struct Types13 { + typedef T1 Head; + typedef Types12 Tail; +}; + +template +struct Types14 { + typedef T1 Head; + typedef Types13 Tail; +}; + +template +struct Types15 { + typedef T1 Head; + typedef Types14 Tail; +}; + +template +struct Types16 { + typedef T1 Head; + typedef Types15 Tail; +}; + +template +struct Types17 { + typedef T1 Head; + typedef Types16 Tail; +}; + +template +struct Types18 { + typedef T1 Head; + typedef Types17 Tail; +}; + +template +struct Types19 { + typedef T1 Head; + typedef Types18 Tail; +}; + +template +struct Types20 { + typedef T1 Head; + typedef Types19 Tail; +}; + +template +struct Types21 { + typedef T1 Head; + typedef Types20 Tail; +}; + +template +struct Types22 { + typedef T1 Head; + typedef Types21 Tail; +}; + +template +struct Types23 { + typedef T1 Head; + typedef Types22 Tail; +}; + +template +struct Types24 { + typedef T1 Head; + typedef Types23 Tail; +}; + +template +struct Types25 { + typedef T1 Head; + typedef Types24 Tail; +}; + +template +struct Types26 { + typedef T1 Head; + typedef Types25 Tail; +}; + +template +struct Types27 { + typedef T1 Head; + typedef Types26 Tail; +}; + +template +struct Types28 { + typedef T1 Head; + typedef Types27 Tail; +}; + +template +struct Types29 { + typedef T1 Head; + typedef Types28 Tail; +}; + +template +struct Types30 { + typedef T1 Head; + typedef Types29 Tail; +}; + +template +struct Types31 { + typedef T1 Head; + typedef Types30 Tail; +}; + +template +struct Types32 { + typedef T1 Head; + typedef Types31 Tail; +}; + +template +struct Types33 { + typedef T1 Head; + typedef Types32 Tail; +}; + +template +struct Types34 { + typedef T1 Head; + typedef Types33 Tail; +}; + +template +struct Types35 { + typedef T1 Head; + typedef Types34 Tail; +}; + +template +struct Types36 { + typedef T1 Head; + typedef Types35 Tail; +}; + +template +struct Types37 { + typedef T1 Head; + typedef Types36 Tail; +}; + +template +struct Types38 { + typedef T1 Head; + typedef Types37 Tail; +}; + +template +struct Types39 { + typedef T1 Head; + typedef Types38 Tail; +}; + +template +struct Types40 { + typedef T1 Head; + typedef Types39 Tail; +}; + +template +struct Types41 { + typedef T1 Head; + typedef Types40 Tail; +}; + +template +struct Types42 { + typedef T1 Head; + typedef Types41 Tail; +}; + +template +struct Types43 { + typedef T1 Head; + typedef Types42 Tail; +}; + +template +struct Types44 { + typedef T1 Head; + typedef Types43 Tail; +}; + +template +struct Types45 { + typedef T1 Head; + typedef Types44 Tail; +}; + +template +struct Types46 { + typedef T1 Head; + typedef Types45 Tail; +}; + +template +struct Types47 { + 
typedef T1 Head; + typedef Types46 Tail; +}; + +template +struct Types48 { + typedef T1 Head; + typedef Types47 Tail; +}; + +template +struct Types49 { + typedef T1 Head; + typedef Types48 Tail; +}; + +template +struct Types50 { + typedef T1 Head; + typedef Types49 Tail; +}; + + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types +// will appear as Types in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types, and Google Test will translate +// that to TypesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. +template +struct Types { + typedef internal::Types50 type; +}; + +template <> +struct Types { + typedef internal::Types0 type; +}; +template +struct Types { + typedef internal::Types1 type; +}; +template +struct Types { + typedef internal::Types2 type; +}; +template +struct Types { + typedef internal::Types3 type; +}; +template +struct Types { + typedef internal::Types4 type; +}; +template +struct Types { + typedef internal::Types5 type; +}; +template +struct Types { + typedef internal::Types6 type; +}; +template +struct Types { + typedef internal::Types7 type; +}; +template +struct Types { + typedef internal::Types8 type; +}; +template +struct Types { + typedef internal::Types9 type; +}; +template +struct Types { + typedef internal::Types10 type; +}; +template +struct Types { + typedef internal::Types11 type; +}; +template +struct Types { + typedef internal::Types12 type; +}; +template +struct Types { + typedef internal::Types13 type; +}; +template +struct Types { + typedef internal::Types14 type; +}; +template +struct Types { + typedef internal::Types15 type; +}; +template +struct Types { + typedef internal::Types16 type; +}; +template +struct Types { + typedef internal::Types17 type; +}; +template +struct Types { + typedef internal::Types18 type; +}; +template +struct Types { + typedef internal::Types19 type; +}; +template +struct Types { + typedef internal::Types20 type; +}; +template +struct Types { + typedef internal::Types21 type; +}; +template +struct Types { + typedef internal::Types22 type; +}; +template +struct Types { + typedef internal::Types23 type; +}; +template +struct Types { + typedef internal::Types24 type; +}; +template +struct Types { + typedef internal::Types25 type; +}; +template +struct Types { + typedef internal::Types26 type; +}; +template +struct Types { + typedef internal::Types27 type; +}; +template +struct Types { + typedef internal::Types28 type; +}; +template +struct Types { + typedef internal::Types29 type; +}; +template +struct Types { + typedef internal::Types30 type; +}; +template +struct Types { + typedef internal::Types31 type; +}; +template +struct Types { + typedef internal::Types32 type; +}; +template +struct Types { + typedef internal::Types33 type; +}; +template +struct Types { + typedef internal::Types34 type; +}; +template +struct Types { + typedef internal::Types35 type; +}; +template +struct Types { + typedef internal::Types36 type; +}; +template +struct Types { + typedef internal::Types37 type; +}; +template +struct Types { + typedef internal::Types38 type; +}; +template +struct Types { 
+ typedef internal::Types39 type; +}; +template +struct Types { + typedef internal::Types40 type; +}; +template +struct Types { + typedef internal::Types41 type; +}; +template +struct Types { + typedef internal::Types42 type; +}; +template +struct Types { + typedef internal::Types43 type; +}; +template +struct Types { + typedef internal::Types44 type; +}; +template +struct Types { + typedef internal::Types45 type; +}; +template +struct Types { + typedef internal::Types46 type; +}; +template +struct Types { + typedef internal::Types47 type; +}; +template +struct Types { + typedef internal::Types48 type; +}; +template +struct Types { + typedef internal::Types49 type; +}; + +namespace internal { + +# define GTEST_TEMPLATE_ template class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates, Templates, +// and etc), which C++ doesn't support directly. +template +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. 
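The TemplateSel/GTEST_BIND_ pair defined above carries a class template around as an ordinary type so that it can sit in a type list. A rough sketch of the same trick with invented names (Box and Selector are not part of the patch):

// Illustrative only: a "selector" wraps a one-parameter class template as a
// type; Bind<T>::type instantiates it later, mirroring TemplateSel/GTEST_BIND_.
template <typename T>
struct Box { T value; };                        // some single-parameter class template

template <template <typename T> class Tmpl>
struct Selector {                               // analogous to TemplateSel
  template <typename T>
  struct Bind {
    typedef Tmpl<T> type;
  };
};

typedef Selector<Box> BoxSelector;              // the template, carried as a type
typedef BoxSelector::Bind<int>::type IntBox;    // same as Box<int>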
+ +template +struct Templates1 { + typedef TemplateSel Head; + typedef Templates0 Tail; +}; +template +struct Templates2 { + typedef TemplateSel Head; + typedef Templates1 Tail; +}; + +template +struct Templates3 { + typedef TemplateSel Head; + typedef Templates2 Tail; +}; + +template +struct Templates4 { + typedef TemplateSel Head; + typedef Templates3 Tail; +}; + +template +struct Templates5 { + typedef TemplateSel Head; + typedef Templates4 Tail; +}; + +template +struct Templates6 { + typedef TemplateSel Head; + typedef Templates5 Tail; +}; + +template +struct Templates7 { + typedef TemplateSel Head; + typedef Templates6 Tail; +}; + +template +struct Templates8 { + typedef TemplateSel Head; + typedef Templates7 Tail; +}; + +template +struct Templates9 { + typedef TemplateSel Head; + typedef Templates8 Tail; +}; + +template +struct Templates10 { + typedef TemplateSel Head; + typedef Templates9 Tail; +}; + +template +struct Templates11 { + typedef TemplateSel Head; + typedef Templates10 Tail; +}; + +template +struct Templates12 { + typedef TemplateSel Head; + typedef Templates11 Tail; +}; + +template +struct Templates13 { + typedef TemplateSel Head; + typedef Templates12 Tail; +}; + +template +struct Templates14 { + typedef TemplateSel Head; + typedef Templates13 Tail; +}; + +template +struct Templates15 { + typedef TemplateSel Head; + typedef Templates14 Tail; +}; + +template +struct Templates16 { + typedef TemplateSel Head; + typedef Templates15 Tail; +}; + +template +struct Templates17 { + typedef TemplateSel Head; + typedef Templates16 Tail; +}; + +template +struct Templates18 { + typedef TemplateSel Head; + typedef Templates17 Tail; +}; + +template +struct Templates19 { + typedef TemplateSel Head; + typedef Templates18 Tail; +}; + +template +struct Templates20 { + typedef TemplateSel Head; + typedef Templates19 Tail; +}; + +template +struct Templates21 { + typedef TemplateSel Head; + typedef Templates20 Tail; +}; + +template +struct Templates22 { + typedef TemplateSel Head; + typedef Templates21 Tail; +}; + +template +struct Templates23 { + typedef TemplateSel Head; + typedef Templates22 Tail; +}; + +template +struct Templates24 { + typedef TemplateSel Head; + typedef Templates23 Tail; +}; + +template +struct Templates25 { + typedef TemplateSel Head; + typedef Templates24 Tail; +}; + +template +struct Templates26 { + typedef TemplateSel Head; + typedef Templates25 Tail; +}; + +template +struct Templates27 { + typedef TemplateSel Head; + typedef Templates26 Tail; +}; + +template +struct Templates28 { + typedef TemplateSel Head; + typedef Templates27 Tail; +}; + +template +struct Templates29 { + typedef TemplateSel Head; + typedef Templates28 Tail; +}; + +template +struct Templates30 { + typedef TemplateSel Head; + typedef Templates29 Tail; +}; + +template +struct Templates31 { + typedef TemplateSel Head; + typedef Templates30 Tail; +}; + +template +struct Templates32 { + typedef TemplateSel Head; + typedef Templates31 Tail; +}; + +template +struct Templates33 { + typedef TemplateSel Head; + typedef Templates32 Tail; +}; + +template +struct Templates34 { + typedef TemplateSel Head; + typedef Templates33 Tail; +}; + +template +struct Templates35 { + typedef TemplateSel Head; + typedef Templates34 Tail; +}; + +template +struct Templates36 { + typedef TemplateSel Head; + typedef Templates35 Tail; +}; + +template +struct Templates37 { + typedef TemplateSel Head; + typedef Templates36 Tail; +}; + +template +struct Templates38 { + typedef TemplateSel Head; + typedef Templates37 Tail; +}; + 
+template +struct Templates39 { + typedef TemplateSel Head; + typedef Templates38 Tail; +}; + +template +struct Templates40 { + typedef TemplateSel Head; + typedef Templates39 Tail; +}; + +template +struct Templates41 { + typedef TemplateSel Head; + typedef Templates40 Tail; +}; + +template +struct Templates42 { + typedef TemplateSel Head; + typedef Templates41 Tail; +}; + +template +struct Templates43 { + typedef TemplateSel Head; + typedef Templates42 Tail; +}; + +template +struct Templates44 { + typedef TemplateSel Head; + typedef Templates43 Tail; +}; + +template +struct Templates45 { + typedef TemplateSel Head; + typedef Templates44 Tail; +}; + +template +struct Templates46 { + typedef TemplateSel Head; + typedef Templates45 Tail; +}; + +template +struct Templates47 { + typedef TemplateSel Head; + typedef Templates46 Tail; +}; + +template +struct Templates48 { + typedef TemplateSel Head; + typedef Templates47 Tail; +}; + +template +struct Templates49 { + typedef TemplateSel Head; + typedef Templates48 Tail; +}; + +template +struct Templates50 { + typedef TemplateSel Head; + typedef Templates49 Tail; +}; + + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates +// will appear as Templates in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates, and Google Test will translate +// that to TemplatesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. 
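The translation described above rests on defaulted template parameters plus partial specializations. A scaled-down sketch with two slots instead of fifty may make the mechanism easier to see; the names (List, Sentinel, and so on) are hypothetical and not from the patch.

// Illustrative only: mapping a user-facing list with defaulted slots onto an
// exact-length internal list, mirroring Templates<...> -> TemplatesN.
struct Sentinel {};                             // plays the role of NoneT/None

struct List0 {};
template <typename T1> struct List1 {};
template <typename T1, typename T2> struct List2 {};

template <typename T1 = Sentinel, typename T2 = Sentinel>
struct List {                                   // primary template: both slots used
  typedef List2<T1, T2> type;
};
template <>
struct List<Sentinel, Sentinel> {               // no slots used
  typedef List0 type;
};
template <typename T1>
struct List<T1, Sentinel> {                     // exactly one slot used
  typedef List1<T1> type;
};

// List<int>::type       is List1<int>
// List<int, char>::type is List2<int, char>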
+template +struct Templates { + typedef Templates50 type; +}; + +template <> +struct Templates { + typedef Templates0 type; +}; +template +struct Templates { + typedef Templates1 type; +}; +template +struct Templates { + typedef Templates2 type; +}; +template +struct Templates { + typedef Templates3 type; +}; +template +struct Templates { + typedef Templates4 type; +}; +template +struct Templates { + typedef Templates5 type; +}; +template +struct Templates { + typedef Templates6 type; +}; +template +struct Templates { + typedef Templates7 type; +}; +template +struct Templates { + typedef Templates8 type; +}; +template +struct Templates { + typedef Templates9 type; +}; +template +struct Templates { + typedef Templates10 type; +}; +template +struct Templates { + typedef Templates11 type; +}; +template +struct Templates { + typedef Templates12 type; +}; +template +struct Templates { + typedef Templates13 type; +}; +template +struct Templates { + typedef Templates14 type; +}; +template +struct Templates { + typedef Templates15 type; +}; +template +struct Templates { + typedef Templates16 type; +}; +template +struct Templates { + typedef Templates17 type; +}; +template +struct Templates { + typedef Templates18 type; +}; +template +struct Templates { + typedef Templates19 type; +}; +template +struct Templates { + typedef Templates20 type; +}; +template +struct Templates { + typedef Templates21 type; +}; +template +struct Templates { + typedef Templates22 type; +}; +template +struct Templates { + typedef Templates23 type; +}; +template +struct Templates { + typedef Templates24 type; +}; +template +struct Templates { + typedef Templates25 type; +}; +template +struct Templates { + typedef Templates26 type; +}; +template +struct Templates { + typedef Templates27 type; +}; +template +struct Templates { + typedef Templates28 type; +}; +template +struct Templates { + typedef Templates29 type; +}; +template +struct Templates { + typedef Templates30 type; +}; +template +struct Templates { + typedef Templates31 type; +}; +template +struct Templates { + typedef Templates32 type; +}; +template +struct Templates { + typedef Templates33 type; +}; +template +struct Templates { + typedef Templates34 type; +}; +template +struct Templates { + typedef Templates35 type; +}; +template +struct Templates { + typedef Templates36 type; +}; +template +struct Templates { + typedef Templates37 type; +}; +template +struct Templates { + typedef Templates38 type; +}; +template +struct Templates { + typedef Templates39 type; +}; +template +struct Templates { + typedef Templates40 type; +}; +template +struct Templates { + typedef Templates41 type; +}; +template +struct Templates { + typedef Templates42 type; +}; +template +struct Templates { + typedef Templates43 type; +}; +template +struct Templates { + typedef Templates44 type; +}; +template +struct Templates { + typedef Templates45 type; +}; +template +struct Templates { + typedef Templates46 type; +}; +template +struct Templates { + typedef Templates47 type; +}; +template +struct Templates { + typedef Templates48 type; +}; +template +struct Templates { + typedef Templates49 type; +}; + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). 
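In user code all of this plumbing sits behind TYPED_TEST_CASE and friends. A typical use, shown for illustration only (FooTest and the test body are made up for the example):

// Illustrative only: standard typed-test usage. Types<int, long, double> is
// translated to an exact-length internal list by the machinery above.
#include "gtest/gtest.h"

template <typename T>
class FooTest : public ::testing::Test {};

typedef ::testing::Types<int, long, double> MyTypes;
TYPED_TEST_CASE(FooTest, MyTypes);

TYPED_TEST(FooTest, DefaultConstructedIsZero) {
  TypeParam value = TypeParam();
  EXPECT_EQ(TypeParam(0), value);
}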
+ +template +struct TypeList { + typedef Types1 type; +}; + +template +struct TypeList > { + typedef typename Types::type type; +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump new file mode 100644 index 0000000000..251fdf025b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump @@ -0,0 +1,297 @@ +$$ -*- mode: c++; -*- +$var n = 50 $$ Maximum length of type lists we want to support. +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most $n types in a list, and at most $n +// type-parameterized tests in one type-parameterized test case. +// Please contact googletestframework@googlegroups.com if you need +// more. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +# if GTEST_HAS_CXXABI_H_ +# include +# elif defined(__HP_aCC) +# include +# endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. 
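GetTypeName() below leans on abi::__cxa_demangle when cxxabi.h is available. A standalone sketch of that call, with error handling reduced to the bare minimum and RTTI assumed to be enabled (not the Google Test implementation itself):

// Illustrative only: demangling typeid(T).name() with abi::__cxa_demangle,
// as GetTypeName() does when GTEST_HAS_CXXABI_H_ is set.
#include <cxxabi.h>
#include <cstdlib>
#include <string>
#include <typeinfo>

template <typename T>
std::string DemangledName() {
  int status = 0;
  // __cxa_demangle malloc()s the readable name; the caller must free() it.
  char* const readable = abi::__cxa_demangle(typeid(T).name(), 0, 0, &status);
  const std::string result(status == 0 ? readable : typeid(T).name());
  std::free(readable);
  return result;
}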
+template +std::string GetTypeName() { +# if GTEST_HAS_RTTI + + const char* const name = typeid(T).name(); +# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +# if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +# endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, 0, 0, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return name_str; +# else + return name; +# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC + +# else + + return ""; + +# endif // GTEST_HAS_RTTI +} + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// AssertyTypeEq::type is defined iff T1 and T2 are the same +// type. This can be used as a compile-time assertion to ensure that +// two types are equal. + +template +struct AssertTypeEq; + +template +struct AssertTypeEq { + typedef bool type; +}; + +// A unique type used as the default value for the arguments of class +// template Types. This allows us to simulate variadic templates +// (e.g. Types, Type, and etc), which C++ doesn't +// support directly. +struct None {}; + +// The following family of struct and struct templates are used to +// represent type lists. In particular, TypesN +// represents a type list with N types (T1, T2, ..., and TN) in it. +// Except for Types0, every struct in the family has two member types: +// Head for the first type in the list, and Tail for the rest of the +// list. + +// The empty type list. +struct Types0 {}; + +// Type lists of length 1, 2, 3, and so on. + +template +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; + +$range i 2..n + +$for i [[ +$range j 1..i +$range k 2..i +template <$for j, [[typename T$j]]> +struct Types$i { + typedef T1 Head; + typedef Types$(i-1)<$for k, [[T$k]]> Tail; +}; + + +]] + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types +// will appear as Types in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types, and Google Test will translate +// that to TypesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. + +$range i 1..n +template <$for i, [[typename T$i = internal::None]]> +struct Types { + typedef internal::Types$n<$for i, [[T$i]]> type; +}; + +template <> +struct Types<$for i, [[internal::None]]> { + typedef internal::Types0 type; +}; + +$range i 1..n-1 +$for i [[ +$range j 1..i +$range k i+1..n +template <$for j, [[typename T$j]]> +struct Types<$for j, [[T$j]]$for k[[, internal::None]]> { + typedef internal::Types$i<$for j, [[T$j]]> type; +}; + +]] + +namespace internal { + +# define GTEST_TEMPLATE_ template class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. 
+template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates, Templates, +// and etc), which C++ doesn't support directly. +template +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. + +template +struct Templates1 { + typedef TemplateSel Head; + typedef Templates0 Tail; +}; + +$range i 2..n + +$for i [[ +$range j 1..i +$range k 2..i +template <$for j, [[GTEST_TEMPLATE_ T$j]]> +struct Templates$i { + typedef TemplateSel Head; + typedef Templates$(i-1)<$for k, [[T$k]]> Tail; +}; + + +]] + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates +// will appear as Templates in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates, and Google Test will translate +// that to TemplatesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. + +$range i 1..n +template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]> +struct Templates { + typedef Templates$n<$for i, [[T$i]]> type; +}; + +template <> +struct Templates<$for i, [[NoneT]]> { + typedef Templates0 type; +}; + +$range i 1..n-1 +$for i [[ +$range j 1..i +$range k i+1..n +template <$for j, [[GTEST_TEMPLATE_ T$j]]> +struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> { + typedef Templates$i<$for j, [[T$j]]> type; +}; + +]] + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). + +template +struct TypeList { + typedef Types1 type; +}; + + +$range i 1..n +template <$for i, [[typename T$i]]> +struct TypeList > { + typedef typename Types<$for i, [[T$i]]>::type type; +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc index 912868148e..0a9cee5223 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -39,9554 +39,10 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -// Copyright 2005, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) - -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// Utilities for testing Google Test itself and code that uses Google Test -// (e.g. frameworks built on top of Google Test). 
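The ScopedFakeTestPartResultReporter/TestPartResultArray pair that this header provides is what lets a test observe a Google Test failure instead of failing itself. A typical (illustrative) use:

// Illustrative only: intercepting a failure in the current thread and then
// asserting on what was captured.
#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"

TEST(SpiDemo, InterceptsFailure) {
  ::testing::TestPartResultArray failures;
  {
    ::testing::ScopedFakeTestPartResultReporter reporter(
        ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ONLY_CURRENT_THREAD,
        &failures);
    ADD_FAILURE() << "captured, not reported";  // intercepted, test still passes
  }
  EXPECT_EQ(1, failures.size());
}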
- -#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ -#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - - -namespace testing { - -// This helper class can be used to mock out Google Test failure reporting -// so that we can test Google Test or code that builds on Google Test. -// -// An object of this class appends a TestPartResult object to the -// TestPartResultArray object given in the constructor whenever a Google Test -// failure is reported. It can either intercept only failures that are -// generated in the same thread that created this object or it can intercept -// all generated failures. The scope of this mock object can be controlled with -// the second argument to the two arguments constructor. -class GTEST_API_ ScopedFakeTestPartResultReporter - : public TestPartResultReporterInterface { - public: - // The two possible mocking modes of this object. - enum InterceptMode { - INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. - INTERCEPT_ALL_THREADS // Intercepts all failures. - }; - - // The c'tor sets this object as the test part result reporter used - // by Google Test. The 'result' parameter specifies where to report the - // results. This reporter will only catch failures generated in the current - // thread. DEPRECATED - explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); - - // Same as above, but you can choose the interception scope of this object. - ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, - TestPartResultArray* result); - - // The d'tor restores the previous test part result reporter. - virtual ~ScopedFakeTestPartResultReporter(); - - // Appends the TestPartResult object to the TestPartResultArray - // received in the constructor. - // - // This method is from the TestPartResultReporterInterface - // interface. - virtual void ReportTestPartResult(const TestPartResult& result); - private: - void Init(); - - const InterceptMode intercept_mode_; - TestPartResultReporterInterface* old_reporter_; - TestPartResultArray* const result_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); -}; - -namespace internal { - -// A helper class for implementing EXPECT_FATAL_FAILURE() and -// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. -class GTEST_API_ SingleFailureChecker { - public: - // The constructor remembers the arguments. - SingleFailureChecker(const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr); - ~SingleFailureChecker(); - private: - const TestPartResultArray* const results_; - const TestPartResult::Type type_; - const string substr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); -}; - -} // namespace internal - -} // namespace testing - -// A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given -// statement will cause exactly one fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_FATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. 
-// -// Known restrictions: -// - 'statement' cannot reference local non-static variables or -// non-static members of the current object. -// - 'statement' cannot return a value. -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. The AcceptsMacroThatExpandsToUnprotectedComma test in -// gtest_unittest.cc will fail to compile if we do that. -#define EXPECT_FATAL_FAILURE(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ALL_THREADS, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -// A macro for testing Google Test assertions or code that's expected to -// generate Google Test non-fatal failures. It asserts that the given -// statement will cause exactly one non-fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// 'statement' is allowed to reference local variables and members of -// the current object. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. -// -// Known restrictions: -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. If we do that, the code won't compile when the user gives -// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that -// expands to code containing an unprotected comma. The -// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc -// catches that. -// -// For the same reason, we have to write -// if (::testing::internal::AlwaysTrue()) { statement; } -// instead of -// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -// to avoid an MSVC warning on unreachable code. 
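Typical use of the two macro families defined here, shown for illustration; the second argument is a substring that must appear in the failure message.

// Illustrative only: checking that a statement produces exactly one fatal
// (respectively non-fatal) Google Test failure.
#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"

TEST(SpiDemo, FatalAndNonFatalFailuresAreChecked) {
  EXPECT_FATAL_FAILURE(ASSERT_TRUE(false), "false");
  EXPECT_NONFATAL_FAILURE(ADD_FAILURE() << "oops", "oops");
}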
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ - >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include // NOLINT -#include -#include - -#if GTEST_OS_LINUX - -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -# include // NOLINT -# include // NOLINT -# include // NOLINT -// Declares vsnprintf(). This header is not available on Windows. -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include - -#elif GTEST_OS_SYMBIAN -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -#elif GTEST_OS_ZOS -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -// On z/OS we additionally need strings.h for strcasecmp. -# include // NOLINT - -#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. - -# include // NOLINT - -#elif GTEST_OS_WINDOWS // We are on Windows proper. - -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT - -# if GTEST_OS_WINDOWS_MINGW -// MinGW has gettimeofday() but not _ftime64(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -// TODO(kenton@google.com): There are other ways to get the time on -// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW -// supports these. consider using them instead. -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT -# endif // GTEST_OS_WINDOWS_MINGW - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT - -#else - -// Assume other platforms have gettimeofday(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT -# include // NOLINT - -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -// Copyright 2005, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Utility functions and classes used by the Google C++ testing framework. -// -// Author: wan@google.com (Zhanyong Wan) -// -// This file contains purely Google Test's internal implementation. Please -// DO NOT #INCLUDE IT IN A USER PROGRAM. - -#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ -#define GTEST_SRC_GTEST_INTERNAL_INL_H_ - -// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is -// part of Google Test's implementation; otherwise it's undefined. -#if !GTEST_IMPLEMENTATION_ -// A user is trying to include this from his code - just say no. -# error "gtest-internal-inl.h is part of Google Test's internal implementation." -# error "It must not be included except by Google Test itself." -#endif // GTEST_IMPLEMENTATION_ - -#ifndef _WIN32_WCE -# include -#endif // !_WIN32_WCE -#include -#include // For strtoll/_strtoul64/malloc/free. -#include // For memmove. - -#include -#include -#include - - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -#if GTEST_OS_WINDOWS -# include // NOLINT -#endif // GTEST_OS_WINDOWS - - -namespace testing { - -// Declares the flags. -// -// We don't want the users to modify this flag in the code, but want -// Google Test's own unit tests to be able to access it. Therefore we -// declare it here as opposed to in gtest.h. -GTEST_DECLARE_bool_(death_test_use_fork); - -namespace internal { - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; - -// Names of the flags (needed for parsing Google Test flags). 
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kPrintTimeFlag[] = "print_time"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; - -// A valid random seed must be in [1, kMaxRandomSeed]. -const int kMaxRandomSeed = 99999; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -GTEST_API_ extern bool g_help_flag; - -// Returns the current time in milliseconds. -GTEST_API_ TimeInMillis GetTimeInMillis(); - -// Returns true iff Google Test should use colors in the output. -GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); - -// Formats the given time in milliseconds as seconds. -GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); - -// Converts the given time in milliseconds to a date string in the ISO 8601 -// format, without the timezone information. N.B.: due to the use the -// non-reentrant localtime() function, this function is not thread safe. Do -// not use it in any code that can be called from multiple threads. -GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); - -// Parses a string for an Int32 flag, in the form of "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, Int32* value); - -// Returns a random seed in range [1, kMaxRandomSeed] based on the -// given --gtest_random_seed flag value. -inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast(GetTimeInMillis()) : - static_cast(random_seed_flag); - - // Normalizes the actual seed to range [1, kMaxRandomSeed] such that - // it's easy to type. - const int normalized_seed = - static_cast((raw_seed - 1U) % - static_cast(kMaxRandomSeed)) + 1; - return normalized_seed; -} - -// Returns the first valid random seed after 'seed'. The behavior is -// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is -// considered to be 1. -inline int GetNextRandomSeed(int seed) { - GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) - << "Invalid random seed " << seed << " - must be in [1, " - << kMaxRandomSeed << "]."; - const int next_seed = seed + 1; - return (next_seed > kMaxRandomSeed) ? 1 : next_seed; -} - -// This class saves the values of all Google Test flags in its c'tor, and -// restores them in its d'tor. -class GTestFlagSaver { - public: - // The c'tor. 
- GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - print_time_ = GTEST_FLAG(print_time); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); - } - - // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. - ~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; - } - - private: - // Fields for saving the original values of flags. - bool also_run_disabled_tests_; - bool break_on_failure_; - bool catch_exceptions_; - std::string color_; - std::string death_test_style_; - bool death_test_use_fork_; - std::string filter_; - std::string internal_run_death_test_; - bool list_tests_; - std::string output_; - bool print_time_; - internal::Int32 random_seed_; - internal::Int32 repeat_; - bool shuffle_; - internal::Int32 stack_trace_depth_; - std::string stream_result_to_; - bool throw_on_failure_; -} GTEST_ATTRIBUTE_UNUSED_; - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted -// to "(Invalid Unicode 0xXXXXXXXX)". -GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. 
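CodePointToUtf8 and WideStringToUtf8 above follow the standard UTF-8 encoding ranges. A minimal sketch of that encoding (not the Google Test implementation; handling of invalid code points above U+10FFFF is omitted here):

// Illustrative only: encoding a single Unicode code point as UTF-8.
#include <string>

std::string EncodeUtf8(unsigned int cp) {
  std::string out;
  if (cp <= 0x7F) {                       // 1 byte:  0xxxxxxx
    out += static_cast<char>(cp);
  } else if (cp <= 0x7FF) {               // 2 bytes: 110xxxxx 10xxxxxx
    out += static_cast<char>(0xC0 | (cp >> 6));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  } else if (cp <= 0xFFFF) {              // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    out += static_cast<char>(0xE0 | (cp >> 12));
    out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  } else {                                // 4 bytes, up to U+10FFFF
    out += static_cast<char>(0xF0 | (cp >> 18));
    out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
    out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  }
  return out;
}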
-GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars); - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded(); - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (e.g., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. -GTEST_API_ bool ShouldShard(const char* total_shards_str, - const char* shard_index_str, - bool in_subprocess_for_death_test); - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error and -// and aborts. -GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); - -// STL container utilities. - -// Returns the number of elements in the given container that satisfy -// the given predicate. -template -inline int CountIf(const Container& c, Predicate predicate) { - // Implemented as an explicit loop since std::count_if() in libCstd on - // Solaris has a non-standard signature. - int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; - } - return count; -} - -// Applies a function/functor to each element in the container. -template -void ForEach(const Container& c, Functor functor) { - std::for_each(c.begin(), c.end(), functor); -} - -// Returns the i-th element of the vector, or default_value if i is not -// in range [0, v.size()). -template -inline E GetElementOr(const std::vector& v, int i, E default_value) { - return (i < 0 || i >= static_cast(v.size())) ? default_value : v[i]; -} - -// Performs an in-place shuffle of a range of the vector's elements. -// 'begin' and 'end' are element indices as an STL-style range; -// i.e. [begin, end) are shuffled, where 'end' == size() means to -// shuffle to the end of the vector. -template -void ShuffleRange(internal::Random* random, int begin, int end, - std::vector* v) { - const int size = static_cast(v->size()); - GTEST_CHECK_(0 <= begin && begin <= size) - << "Invalid shuffle range start " << begin << ": must be in range [0, " - << size << "]."; - GTEST_CHECK_(begin <= end && end <= size) - << "Invalid shuffle range finish " << end << ": must be in range [" - << begin << ", " << size << "]."; - - // Fisher-Yates shuffle, from - // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle - for (int range_width = end - begin; range_width >= 2; range_width--) { - const int last_in_range = begin + range_width - 1; - const int selected = begin + random->Generate(range_width); - std::swap((*v)[selected], (*v)[last_in_range]); - } -} - -// Performs an in-place shuffle of the vector's elements. 
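The sharding helpers declared earlier in this header (ShouldShard, ShouldRunTestOnShard) promise only that each test id runs on exactly one shard, given 0 <= shard_index < total_shards. One straightforward policy satisfying that contract is a simple modulo; this sketch is illustrative and not necessarily the exact implementation.

// Illustrative only: a modulo-based sharding policy.
bool RunOnThisShard(int total_shards, int shard_index, int test_id) {
  return (test_id % total_shards) == shard_index;
}
// Example: with GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1, the tests with
// ids 1, 4, 7, ... are run by this shard.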
-template -inline void Shuffle(internal::Random* random, std::vector* v) { - ShuffleRange(random, 0, static_cast(v->size()), v); -} - -// A function for deleting an object. Handy for being used as a -// functor. -template -static void Delete(T* x) { - delete x; -} - -// A predicate that checks the key of a TestProperty against a known key. -// -// TestPropertyKeyIs is copyable. -class TestPropertyKeyIs { - public: - // Constructor. - // - // TestPropertyKeyIs has NO default constructor. - explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} - - // Returns true iff the test name of test property matches on key_. - bool operator()(const TestProperty& test_property) const { - return test_property.key() == key_; - } - - private: - std::string key_; -}; - -// Class UnitTestOptions. -// -// This class contains functions for processing options the user -// specifies when running the tests. It has only static members. -// -// In most cases, the user can specify an option using either an -// environment variable or a command line flag. E.g. you can set the -// test filter using either GTEST_FILTER or --gtest_filter. If both -// the variable and the flag are present, the latter overrides the -// former. -class GTEST_API_ UnitTestOptions { - public: - // Functions for processing the gtest_output flag. - - // Returns the output format, or "" for normal printed output. - static std::string GetOutputFormat(); - - // Returns the absolute path of the requested output file, or the - // default (test_detail.xml in the original working directory) if - // none was explicitly specified. - static std::string GetAbsolutePathToOutputFile(); - - // Functions for processing the gtest_filter flag. - - // Returns true iff the wildcard pattern matches the string. The - // first ':' or '\0' character in pattern marks the end of it. - // - // This recursive algorithm isn't very efficient, but is clear and - // works well enough for matching test names, which are short. - static bool PatternMatchesString(const char *pattern, const char *str); - - // Returns true iff the user-specified filter matches the test case - // name and the test name. - static bool FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name); - -#if GTEST_OS_WINDOWS - // Function for supporting the gtest_catch_exception flag. - - // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the - // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. - // This function is useful as an __except condition. - static int GTestShouldProcessSEH(DWORD exception_code); -#endif // GTEST_OS_WINDOWS - - // Returns true if "name" matches the ':' separated list of glob-style - // filters in "filter". - static bool MatchesFilter(const std::string& name, const char* filter); -}; - -// Returns the current application's name, removing directory path if that -// is present. Used by UnitTestOptions::GetOutputFile. -GTEST_API_ FilePath GetCurrentExecutableName(); - -// The role interface for getting the OS stack trace as a string. -class OsStackTraceGetterInterface { - public: - OsStackTraceGetterInterface() {} - virtual ~OsStackTraceGetterInterface() {} - - // Returns the current OS stack trace as an std::string. Parameters: - // - // max_depth - the maximum number of stack frames to be included - // in the trace. - // skip_count - the number of top frames to be skipped; doesn't count - // against max_depth. 
- virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; - - // UponLeavingGTest() should be called immediately before Google Test calls - // user code. It saves some information about the current stack that - // CurrentStackTrace() will use to find and hide Google Test stack frames. - virtual void UponLeavingGTest() = 0; - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); -}; - -// A working implementation of the OsStackTraceGetterInterface interface. -class OsStackTraceGetter : public OsStackTraceGetterInterface { - public: - OsStackTraceGetter() : caller_frame_(NULL) {} - - virtual string CurrentStackTrace(int max_depth, int skip_count) - GTEST_LOCK_EXCLUDED_(mutex_); - - virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_); - - // This string is inserted in place of stack frames that are part of - // Google Test's implementation. - static const char* const kElidedFramesMarker; - - private: - Mutex mutex_; // protects all internal state - - // We save the stack frame below the frame that calls user code. - // We do this because the address of the frame immediately below - // the user code changes between the call to UponLeavingGTest() - // and any calls to CurrentStackTrace() from within the user code. - void* caller_frame_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); -}; - -// Information about a Google Test trace point. -struct TraceInfo { - const char* file; - int line; - std::string message; -}; - -// This is the default global test part result reporter used in UnitTestImpl. -// This class should only be used by UnitTestImpl. -class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. Reports the test part - // result in the current test. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); -}; - -// This is the default per thread test part result reporter used in -// UnitTestImpl. This class should only be used by UnitTestImpl. -class DefaultPerThreadTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. The implementation just - // delegates to the current global test part result reporter of *unit_test_. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); -}; - -// The private implementation of the UnitTest class. We don't protect -// the methods under a mutex, as this class is not accessible by a -// user and the UnitTest class that delegates work to this class does -// proper locking. -class GTEST_API_ UnitTestImpl { - public: - explicit UnitTestImpl(UnitTest* parent); - virtual ~UnitTestImpl(); - - // There are two different ways to register your own TestPartResultReporter. - // You can register your own repoter to listen either only for test results - // from the current thread or for results from all threads. - // By default, each per-thread test result repoter just passes a new - // TestPartResult to the global test result reporter, which registers the - // test part result for the currently running test. 
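For orientation while reviewing the reporter plumbing being removed here: the supported, public way for user code to observe test part results is the TestEventListeners API, not these internal reporter interfaces. A minimal sketch under that assumption (the FailureLogger class, its output format, and the standalone main() are hypothetical, illustrative only, and not part of this patch):

#include <stdio.h>

#include "gtest/gtest.h"

// Hypothetical listener: logs every failed assertion as it is reported,
// using only the public TestEventListener interface.
class FailureLogger : public ::testing::EmptyTestEventListener {
  virtual void OnTestPartResult(const ::testing::TestPartResult& result) {
    if (result.failed()) {
      printf("Failure at %s:%d\n%s\n",
             result.file_name() != NULL ? result.file_name() : "unknown file",
             result.line_number(), result.summary());
    }
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  // The listener list takes ownership of appended listeners.
  ::testing::UnitTest::GetInstance()->listeners().Append(new FailureLogger);
  return RUN_ALL_TESTS();
}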
- - // Returns the global test part result reporter. - TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); - - // Sets the global test part result reporter. - void SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter); - - // Returns the test part result reporter for the current thread. - TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); - - // Sets the test part result reporter for the current thread. - void SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter); - - // Gets the number of successful test cases. - int successful_test_case_count() const; - - // Gets the number of failed test cases. - int failed_test_case_count() const; - - // Gets the number of all test cases. - int total_test_case_count() const; - - // Gets the number of all test cases that contain at least one test - // that should run. - int test_case_to_run_count() const; - - // Gets the number of successful tests. - int successful_test_count() const; - - // Gets the number of failed tests. - int failed_test_count() const; - - // Gets the number of disabled tests that will be reported in the XML report. - int reportable_disabled_test_count() const; - - // Gets the number of disabled tests. - int disabled_test_count() const; - - // Gets the number of tests to be printed in the XML report. - int reportable_test_count() const; - - // Gets the number of all tests. - int total_test_count() const; - - // Gets the number of tests that should run. - int test_to_run_count() const; - - // Gets the time of the test program start, in ms from the start of the - // UNIX epoch. - TimeInMillis start_timestamp() const { return start_timestamp_; } - - // Gets the elapsed time, in milliseconds. - TimeInMillis elapsed_time() const { return elapsed_time_; } - - // Returns true iff the unit test passed (i.e. all test cases passed). - bool Passed() const { return !Failed(); } - - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). - bool Failed() const { - return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - const TestCase* GetTestCase(int i) const { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[i]; - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i) { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[index]; - } - - // Provides access to the event listener list. - TestEventListeners* listeners() { return &listeners_; } - - // Returns the TestResult for the test that's currently running, or - // the TestResult for the ad hoc test if no test is running. - TestResult* current_test_result(); - - // Returns the TestResult for the ad hoc test. - const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } - - // Sets the OS stack trace getter. - // - // Does nothing if the input and the current OS stack trace getter - // are the same; otherwise, deletes the old getter and makes the - // input the current getter. 
- void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); - - // Returns the current OS stack trace getter if it is not NULL; - // otherwise, creates an OsStackTraceGetter, makes it the current - // getter, and returns it. - OsStackTraceGetterInterface* os_stack_trace_getter(); - - // Returns the current OS stack trace as an std::string. - // - // The maximum number of stack frames to be included is specified by - // the gtest_stack_trace_depth flag. The skip_count parameter - // specifies the number of top frames to be skipped, which doesn't - // count against the number of frames to be included. - // - // For example, if Foo() calls Bar(), which in turn calls - // CurrentOsStackTraceExceptTop(1), Foo() will be included in the - // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; - - // Finds and returns a TestCase with the given name. If one doesn't - // exist, creates one and returns it. - // - // Arguments: - // - // test_case_name: name of the test case - // type_param: the name of the test's type parameter, or NULL if - // this is not a typed or a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase* GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); - - // Adds a TestInfo to the unit test. - // - // Arguments: - // - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - // test_info: the TestInfo object - void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, - TestInfo* test_info) { - // In order to support thread-safe death tests, we need to - // remember the original working directory when the test program - // was first invoked. We cannot do this in RUN_ALL_TESTS(), as - // the user may have changed the current directory before calling - // RUN_ALL_TESTS(). Therefore we capture the current directory in - // AddTestInfo(), which is called to register a TEST or TEST_F - // before main() is reached. - if (original_working_dir_.IsEmpty()) { - original_working_dir_.Set(FilePath::GetCurrentDir()); - GTEST_CHECK_(!original_working_dir_.IsEmpty()) - << "Failed to get the current working directory."; - } - - GetTestCase(test_info->test_case_name(), - test_info->type_param(), - set_up_tc, - tear_down_tc)->AddTestInfo(test_info); - } - -#if GTEST_HAS_PARAM_TEST - // Returns ParameterizedTestCaseRegistry object used to keep track of - // value-parameterized tests and instantiate and register them. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { - return parameterized_test_registry_; - } -#endif // GTEST_HAS_PARAM_TEST - - // Sets the TestCase object for the test that's currently running. - void set_current_test_case(TestCase* a_current_test_case) { - current_test_case_ = a_current_test_case; - } - - // Sets the TestInfo object for the test that's currently running. If - // current_test_info is NULL, the assertion results will be stored in - // ad_hoc_test_result_. 
- void set_current_test_info(TestInfo* a_current_test_info) { - current_test_info_ = a_current_test_info; - } - - // Registers all parameterized tests defined using TEST_P and - // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter - // combination. This method can be called more then once; it has guards - // protecting from registering the tests more then once. If - // value-parameterized tests are disabled, RegisterParameterizedTests is - // present but does nothing. - void RegisterParameterizedTests(); - - // Runs all tests in this UnitTest object, prints the result, and - // returns true if all tests are successful. If any exception is - // thrown during a test, this test is considered to be failed, but - // the rest of the tests will still be run. - bool RunAllTests(); - - // Clears the results of all tests, except the ad hoc tests. - void ClearNonAdHocTestResult() { - ForEach(test_cases_, TestCase::ClearTestCaseResult); - } - - // Clears the results of ad-hoc test assertions. - void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } - - // Adds a TestProperty to the current TestResult object when invoked in a - // context of a test or a test case, or to the global property set. If the - // result already contains a property with the same key, the value will be - // updated. - void RecordProperty(const TestProperty& test_property); - - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; - - // Matches the full name of each test against the user-specified - // filter to decide whether the test should run, then records the - // result in each TestCase and TestInfo object. - // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests - // based on sharding variables in the environment. - // Returns the number of tests that should run. - int FilterTests(ReactionToSharding shard_tests); - - // Prints the names of the tests matching the user-specified filter flag. - void ListTestsMatchingFilter(); - - const TestCase* current_test_case() const { return current_test_case_; } - TestInfo* current_test_info() { return current_test_info_; } - const TestInfo* current_test_info() const { return current_test_info_; } - - // Returns the vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector& environments() { return environments_; } - - // Getters for the per-thread Google Test trace stack. - std::vector& gtest_trace_stack() { - return *(gtest_trace_stack_.pointer()); - } - const std::vector& gtest_trace_stack() const { - return gtest_trace_stack_.get(); - } - -#if GTEST_HAS_DEATH_TEST - void InitDeathTestSubprocessControlInfo() { - internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); - } - // Returns a pointer to the parsed --gtest_internal_run_death_test - // flag, or NULL if that flag was not specified. - // This information is useful only in a death test child process. - // Must not be called before a call to InitGoogleTest. - const InternalRunDeathTestFlag* internal_run_death_test_flag() const { - return internal_run_death_test_flag_.get(); - } - - // Returns a pointer to the current death test factory. - internal::DeathTestFactory* death_test_factory() { - return death_test_factory_.get(); - } - - void SuppressTestEventsIfInSubprocess(); - - friend class ReplaceDeathTestFactory; -#endif // GTEST_HAS_DEATH_TEST - - // Initializes the event listener performing XML output as specified by - // UnitTestOptions. Must not be called before InitGoogleTest. 
- void ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Initializes the event listener for streaming test results to a socket. - // Must not be called before InitGoogleTest. - void ConfigureStreamingOutput(); -#endif - - // Performs initialization dependent upon flag values obtained in - // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to - // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest - // this function is also called from RunAllTests. Since this function can be - // called more than once, it has to be idempotent. - void PostFlagParsingInit(); - - // Gets the random seed used at the start of the current test iteration. - int random_seed() const { return random_seed_; } - - // Gets the random number generator. - internal::Random* random() { return &random_; } - - // Shuffles all test cases, and the tests within each test case, - // making sure that death tests are still run first. - void ShuffleTests(); - - // Restores the test cases and tests to their order before the first shuffle. - void UnshuffleTests(); - - // Returns the value of GTEST_FLAG(catch_exceptions) at the moment - // UnitTest::Run() starts. - bool catch_exceptions() const { return catch_exceptions_; } - - private: - friend class ::testing::UnitTest; - - // Used by UnitTest::Run() to capture the state of - // GTEST_FLAG(catch_exceptions) at the moment it starts. - void set_catch_exceptions(bool value) { catch_exceptions_ = value; } - - // The UnitTest object that owns this implementation object. - UnitTest* const parent_; - - // The working directory when the first TEST() or TEST_F() was - // executed. - internal::FilePath original_working_dir_; - - // The default test part result reporters. - DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; - DefaultPerThreadTestPartResultReporter - default_per_thread_test_part_result_reporter_; - - // Points to (but doesn't own) the global test part result reporter. - TestPartResultReporterInterface* global_test_part_result_repoter_; - - // Protects read and write access to global_test_part_result_reporter_. - internal::Mutex global_test_part_result_reporter_mutex_; - - // Points to (but doesn't own) the per-thread test part result reporter. - internal::ThreadLocal - per_thread_test_part_result_reporter_; - - // The vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector environments_; - - // The vector of TestCases in their original order. It owns the - // elements in the vector. - std::vector test_cases_; - - // Provides a level of indirection for the test case list to allow - // easy shuffling and restoring the test case order. The i-th - // element of this vector is the index of the i-th test case in the - // shuffled order. - std::vector test_case_indices_; - -#if GTEST_HAS_PARAM_TEST - // ParameterizedTestRegistry object used to register value-parameterized - // tests. - internal::ParameterizedTestCaseRegistry parameterized_test_registry_; - - // Indicates whether RegisterParameterizedTests() has been called already. - bool parameterized_tests_registered_; -#endif // GTEST_HAS_PARAM_TEST - - // Index of the last death test case registered. Initially -1. - int last_death_test_case_; - - // This points to the TestCase for the currently running test. It - // changes as Google Test goes through one test case after another. - // When no test is running, this is set to NULL and Google Test - // stores assertion results in ad_hoc_test_result_. 
Initially NULL. - TestCase* current_test_case_; - - // This points to the TestInfo for the currently running test. It - // changes as Google Test goes through one test after another. When - // no test is running, this is set to NULL and Google Test stores - // assertion results in ad_hoc_test_result_. Initially NULL. - TestInfo* current_test_info_; - - // Normally, a user only writes assertions inside a TEST or TEST_F, - // or inside a function called by a TEST or TEST_F. Since Google - // Test keeps track of which test is current running, it can - // associate such an assertion with the test it belongs to. - // - // If an assertion is encountered when no TEST or TEST_F is running, - // Google Test attributes the assertion result to an imaginary "ad hoc" - // test, and records the result in ad_hoc_test_result_. - TestResult ad_hoc_test_result_; - - // The list of event listeners that can be used to track events inside - // Google Test. - TestEventListeners listeners_; - - // The OS stack trace getter. Will be deleted when the UnitTest - // object is destructed. By default, an OsStackTraceGetter is used, - // but the user can set this field to use a custom getter if that is - // desired. - OsStackTraceGetterInterface* os_stack_trace_getter_; - - // True iff PostFlagParsingInit() has been called. - bool post_flag_parse_init_performed_; - - // The random number seed used at the beginning of the test run. - int random_seed_; - - // Our random number generator. - internal::Random random_; - - // The time of the test program start, in ms from the start of the - // UNIX epoch. - TimeInMillis start_timestamp_; - - // How long the test took to run, in milliseconds. - TimeInMillis elapsed_time_; - -#if GTEST_HAS_DEATH_TEST - // The decomposed components of the gtest_internal_run_death_test flag, - // parsed when RUN_ALL_TESTS is called. - internal::scoped_ptr internal_run_death_test_flag_; - internal::scoped_ptr death_test_factory_; -#endif // GTEST_HAS_DEATH_TEST - - // A per-thread stack of traces created by the SCOPED_TRACE() macro. - internal::ThreadLocal > gtest_trace_stack_; - - // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() - // starts. - bool catch_exceptions_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); -}; // class UnitTestImpl - -// Convenience function for accessing the global UnitTest -// implementation object. -inline UnitTestImpl* GetUnitTestImpl() { - return UnitTest::GetInstance()->impl(); -} - -#if GTEST_USES_SIMPLE_RE - -// Internal helper functions for implementing the simple regular -// expression matcher. -GTEST_API_ bool IsInSet(char ch, const char* str); -GTEST_API_ bool IsAsciiDigit(char ch); -GTEST_API_ bool IsAsciiPunct(char ch); -GTEST_API_ bool IsRepeat(char ch); -GTEST_API_ bool IsAsciiWhiteSpace(char ch); -GTEST_API_ bool IsAsciiWordChar(char ch); -GTEST_API_ bool IsValidEscape(char ch); -GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); -GTEST_API_ bool ValidateRegex(const char* regex); -GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); -GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); - -#endif // GTEST_USES_SIMPLE_RE - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. 
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); -GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); - -#if GTEST_HAS_DEATH_TEST - -// Returns the message describing the last system error, regardless of the -// platform. -GTEST_API_ std::string GetLastErrnoDescription(); - -# if GTEST_OS_WINDOWS -// Provides leak-safe Windows kernel handle ownership. -class AutoHandle { - public: - AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} - explicit AutoHandle(HANDLE handle) : handle_(handle) {} - - ~AutoHandle() { Reset(); } - - HANDLE Get() const { return handle_; } - void Reset() { Reset(INVALID_HANDLE_VALUE); } - void Reset(HANDLE handle) { - if (handle != handle_) { - if (handle_ != INVALID_HANDLE_VALUE) - ::CloseHandle(handle_); - handle_ = handle; - } - } - - private: - HANDLE handle_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); -}; -# endif // GTEST_OS_WINDOWS - -// Attempts to parse a string into a positive integer pointed to by the -// number parameter. Returns true if that is possible. -// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use -// it here. -template -bool ParseNaturalNumber(const ::std::string& str, Integer* number) { - // Fail fast if the given string does not begin with a digit; - // this bypasses strtoXXX's "optional leading whitespace and plus - // or minus sign" semantics, which are undesirable here. - if (str.empty() || !IsDigit(str[0])) { - return false; - } - errno = 0; - - char* end; - // BiggestConvertible is the largest integer type that system-provided - // string-to-number conversion routines can return. - -# if GTEST_OS_WINDOWS && !defined(__GNUC__) - - // MSVC and C++ Builder define __int64 instead of the standard long long. - typedef unsigned __int64 BiggestConvertible; - const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10); - -# else - - typedef unsigned long long BiggestConvertible; // NOLINT - const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); - -# endif // GTEST_OS_WINDOWS && !defined(__GNUC__) - - const bool parse_success = *end == '\0' && errno == 0; - - // TODO(vladl@google.com): Convert this to compile time assertion when it is - // available. - GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); - - const Integer result = static_cast(parsed); - if (parse_success && static_cast(result) == parsed) { - *number = result; - return true; - } - return false; -} -#endif // GTEST_HAS_DEATH_TEST - -// TestResult contains some private methods that should be hidden from -// Google Test user but are required for testing. This class allow our tests -// to access them. -// -// This class is supplied only for the purpose of testing Google Test's own -// constructs. Do not use it in user tests, either directly or indirectly. -class TestResultAccessor { - public: - static void RecordProperty(TestResult* test_result, - const std::string& xml_element, - const TestProperty& property) { - test_result->RecordProperty(xml_element, property); - } - - static void ClearTestPartResults(TestResult* test_result) { - test_result->ClearTestPartResults(); - } - - static const std::vector& test_part_results( - const TestResult& test_result) { - return test_result.test_part_results(); - } -}; - -#if GTEST_CAN_STREAM_RESULTS_ - -// Streams test results to the given port on the given host machine. -class StreamingListener : public EmptyTestEventListener { - public: - // Abstract base class for writing strings to a socket. 
- class AbstractSocketWriter { - public: - virtual ~AbstractSocketWriter() {} - - // Sends a string to the socket. - virtual void Send(const string& message) = 0; - - // Closes the socket. - virtual void CloseConnection() {} - - // Sends a string and a newline to the socket. - void SendLn(const string& message) { - Send(message + "\n"); - } - }; - - // Concrete class for actually writing strings to a socket. - class SocketWriter : public AbstractSocketWriter { - public: - SocketWriter(const string& host, const string& port) - : sockfd_(-1), host_name_(host), port_num_(port) { - MakeConnection(); - } - - virtual ~SocketWriter() { - if (sockfd_ != -1) - CloseConnection(); - } - - // Sends a string to the socket. - virtual void Send(const string& message) { - GTEST_CHECK_(sockfd_ != -1) - << "Send() can be called only when there is a connection."; - - const int len = static_cast(message.length()); - if (write(sockfd_, message.c_str(), len) != len) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; - } - } - - private: - // Creates a client socket and connects to the server. - void MakeConnection(); - - // Closes the socket. - void CloseConnection() { - GTEST_CHECK_(sockfd_ != -1) - << "CloseConnection() can be called only when there is a connection."; - - close(sockfd_); - sockfd_ = -1; - } - - int sockfd_; // socket file descriptor - const string host_name_; - const string port_num_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); - }; // class SocketWriter - - // Escapes '=', '&', '%', and '\n' characters in str as "%xx". - static string UrlEncode(const char* str); - - StreamingListener(const string& host, const string& port) - : socket_writer_(new SocketWriter(host, port)) { Start(); } - - explicit StreamingListener(AbstractSocketWriter* socket_writer) - : socket_writer_(socket_writer) { Start(); } - - void OnTestProgramStart(const UnitTest& /* unit_test */) { - SendLn("event=TestProgramStart"); - } - - void OnTestProgramEnd(const UnitTest& unit_test) { - // Note that Google Test current only report elapsed time for each - // test iteration, not for the entire test program. - SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); - - // Notify the streaming server to stop. 
- socket_writer_->CloseConnection(); - } - - void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { - SendLn("event=TestIterationStart&iteration=" + - StreamableToString(iteration)); - } - - void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { - SendLn("event=TestIterationEnd&passed=" + - FormatBool(unit_test.Passed()) + "&elapsed_time=" + - StreamableToString(unit_test.elapsed_time()) + "ms"); - } - - void OnTestCaseStart(const TestCase& test_case) { - SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); - } - - void OnTestCaseEnd(const TestCase& test_case) { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) - + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) - + "ms"); - } - - void OnTestStart(const TestInfo& test_info) { - SendLn(std::string("event=TestStart&name=") + test_info.name()); - } - - void OnTestEnd(const TestInfo& test_info) { - SendLn("event=TestEnd&passed=" + - FormatBool((test_info.result())->Passed()) + - "&elapsed_time=" + - StreamableToString((test_info.result())->elapsed_time()) + "ms"); - } - - void OnTestPartResult(const TestPartResult& test_part_result) { - const char* file_name = test_part_result.file_name(); - if (file_name == NULL) - file_name = ""; - SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + - "&line=" + StreamableToString(test_part_result.line_number()) + - "&message=" + UrlEncode(test_part_result.message())); - } - - private: - // Sends the given message and a newline to the socket. - void SendLn(const string& message) { socket_writer_->SendLn(message); } - - // Called at the start of streaming to notify the receiver what - // protocol we are using. - void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } - - string FormatBool(bool value) { return value ? "1" : "0"; } - - const scoped_ptr socket_writer_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); -}; // class StreamingListener - -#endif // GTEST_CAN_STREAM_RESULTS_ - -} // namespace internal -} // namespace testing - -#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ -#undef GTEST_IMPLEMENTATION_ - -#if GTEST_OS_WINDOWS -# define vsnprintf _vsnprintf -#endif // GTEST_OS_WINDOWS - -namespace testing { - -using internal::CountIf; -using internal::ForEach; -using internal::GetElementOr; -using internal::Shuffle; - -// Constants. - -// A test whose test case name or test name matches this filter is -// disabled and not run. -static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; - -// A test case whose name matches this filter is considered a death -// test case and will be run before test cases whose name doesn't -// match this filter. -static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; - -// A test filter that matches everything. -static const char kUniversalFilter[] = "*"; - -// The default output file for XML output. -static const char kDefaultOutputFile[] = "test_detail.xml"; - -// The environment variable name for the test shard index. -static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; -// The environment variable name for the total number of test shards. -static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; -// The environment variable name for the test shard status file. -static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; - -namespace internal { - -// The text used in failure messages to indicate the start of the -// stack trace. 
-const char kStackTraceMarker[] = "\nStack trace:\n"; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -bool g_help_flag = false; - -} // namespace internal - -static const char* GetDefaultFilter() { - return kUniversalFilter; -} - -GTEST_DEFINE_bool_( - also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), - "Run disabled tests too, in addition to the tests normally being run."); - -GTEST_DEFINE_bool_( - break_on_failure, - internal::BoolFromGTestEnv("break_on_failure", false), - "True iff a failed assertion should be a debugger break-point."); - -GTEST_DEFINE_bool_( - catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), - "True iff " GTEST_NAME_ - " should catch exceptions and treat them as test failures."); - -GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), - "Whether to use colors in the output. Valid values: yes, no, " - "and auto. 'auto' means to use colors if the output is " - "being sent to a terminal and the TERM environment variable " - "is set to a terminal type that supports colors."); - -GTEST_DEFINE_string_( - filter, - internal::StringFromGTestEnv("filter", GetDefaultFilter()), - "A colon-separated list of glob (not regex) patterns " - "for filtering the tests to run, optionally followed by a " - "'-' and a : separated list of negative patterns (tests to " - "exclude). A test is run if it matches one of the positive " - "patterns and does not match any of the negative patterns."); - -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); - -GTEST_DEFINE_string_( - output, - internal::StringFromGTestEnv("output", ""), - "A format (currently must be \"xml\"), optionally followed " - "by a colon and an output file name or directory. A directory " - "is indicated by a trailing pathname separator. " - "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " - "If a directory is specified, output files will be created " - "within that directory, with file-names based on the test " - "executable's name and, if necessary, made unique by adding " - "digits."); - -GTEST_DEFINE_bool_( - print_time, - internal::BoolFromGTestEnv("print_time", true), - "True iff " GTEST_NAME_ - " should display elapsed time in text output."); - -GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), - "Random number seed to use when shuffling test orders. Must be in range " - "[1, 99999], or 0 to use a seed based on the current time."); - -GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), - "How many times to repeat each test. Specify a negative number " - "for repeating forever. Useful for shaking out flaky tests."); - -GTEST_DEFINE_bool_( - show_internal_stack_frames, false, - "True iff " GTEST_NAME_ " should include internal stack frames when " - "printing test failure stack traces."); - -GTEST_DEFINE_bool_( - shuffle, - internal::BoolFromGTestEnv("shuffle", false), - "True iff " GTEST_NAME_ - " should randomize tests' order on every run."); - -GTEST_DEFINE_int32_( - stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), - "The maximum number of stack frames to print when an " - "assertion fails. The valid range is 0 through 100, inclusive."); - -GTEST_DEFINE_string_( - stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), - "This flag specifies the host name and the port number on which to stream " - "test results. 
Example: \"localhost:555\". The flag is effective only on " - "Linux."); - -GTEST_DEFINE_bool_( - throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), - "When this flag is specified, a failed assertion will throw an exception " - "if exceptions are enabled or exit the program with a non-zero code " - "otherwise."); - -namespace internal { - -// Generates a random number from [0, range), using a Linear -// Congruential Generator (LCG). Crashes if 'range' is 0 or greater -// than kMaxRange. -UInt32 Random::Generate(UInt32 range) { - // These constants are the same as are used in glibc's rand(3). - state_ = (1103515245U*state_ + 12345U) % kMaxRange; - - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; - GTEST_CHECK_(range <= kMaxRange) - << "Generation of a number in [0, " << range << ") was requested, " - << "but this can only generate numbers in [0, " << kMaxRange << ")."; - - // Converting via modulus introduces a bit of downward bias, but - // it's simple, and a linear congruential generator isn't too good - // to begin with. - return state_ % range; -} - -// GTestIsInitialized() returns true iff the user has initialized -// Google Test. Useful for catching the user mistake of not initializing -// Google Test before calling RUN_ALL_TESTS(). -// -// A user must call testing::InitGoogleTest() to initialize Google -// Test. g_init_gtest_count is set to the number of times -// InitGoogleTest() has been called. We don't protect this variable -// under a mutex as it is only accessed in the main thread. -GTEST_API_ int g_init_gtest_count = 0; -static bool GTestIsInitialized() { return g_init_gtest_count != 0; } - -// Iterates over a vector of TestCases, keeping a running sum of the -// results of calling a given int-returning method on each. -// Returns the sum. -static int SumOverTestCaseList(const std::vector& case_list, - int (TestCase::*method)() const) { - int sum = 0; - for (size_t i = 0; i < case_list.size(); i++) { - sum += (case_list[i]->*method)(); - } - return sum; -} - -// Returns true iff the test case passed. -static bool TestCasePassed(const TestCase* test_case) { - return test_case->should_run() && test_case->Passed(); -} - -// Returns true iff the test case failed. -static bool TestCaseFailed(const TestCase* test_case) { - return test_case->should_run() && test_case->Failed(); -} - -// Returns true iff test_case contains at least one test that should -// run. -static bool ShouldRunTestCase(const TestCase* test_case) { - return test_case->should_run(); -} - -// AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} - -AssertHelper::~AssertHelper() { - delete data_; -} - -// Message assignment, for assertion streaming support. -void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT -} - -// Mutex for linked pointers. -GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// Application pathname gotten in InitGoogleTest. -std::string g_executable_path; - -// Returns the current application's name, removing directory path if that -// is present. 
-FilePath GetCurrentExecutableName() { - FilePath result; - -#if GTEST_OS_WINDOWS - result.Set(FilePath(g_executable_path).RemoveExtension("exe")); -#else - result.Set(FilePath(g_executable_path)); -#endif // GTEST_OS_WINDOWS - - return result.RemoveDirectoryName(); -} - -// Functions for processing the gtest_output flag. - -// Returns the output format, or "" for normal printed output. -std::string UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) return std::string(""); - - const char* const colon = strchr(gtest_output_flag, ':'); - return (colon == NULL) ? - std::string(gtest_output_flag) : - std::string(gtest_output_flag, colon - gtest_output_flag); -} - -// Returns the name of the requested output file, or the default if none -// was explicitly specified. -std::string UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) - return ""; - - const char* const colon = strchr(gtest_output_flag, ':'); - if (colon == NULL) - return internal::FilePath::ConcatPaths( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile)).string(); - - internal::FilePath output_name(colon + 1); - if (!output_name.IsAbsolutePath()) - // TODO(wan@google.com): on Windows \some\path is not an absolute - // path (as its meaning depends on the current drive), yet the - // following logic for turning it into an absolute path is wrong. - // Fix it. - output_name = internal::FilePath::ConcatPaths( - internal::FilePath(UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(colon + 1)); - - if (!output_name.IsDirectory()) - return output_name.string(); - - internal::FilePath result(internal::FilePath::GenerateUniqueFileName( - output_name, internal::GetCurrentExecutableName(), - GetOutputFormat().c_str())); - return result.string(); -} - -// Returns true iff the wildcard pattern matches the string. The -// first ':' or '\0' character in pattern marks the end of it. -// -// This recursive algorithm isn't very efficient, but is clear and -// works well enough for matching test names, which are short. -bool UnitTestOptions::PatternMatchesString(const char *pattern, - const char *str) { - switch (*pattern) { - case '\0': - case ':': // Either ':' or '\0' marks the end of the pattern. - return *str == '\0'; - case '?': // Matches any single character. - return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); - case '*': // Matches any string (possibly empty) of characters. - return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || - PatternMatchesString(pattern + 1, str); - default: // Non-special character. Matches itself. - return *pattern == *str && - PatternMatchesString(pattern + 1, str + 1); - } -} - -bool UnitTestOptions::MatchesFilter( - const std::string& name, const char* filter) { - const char *cur_pattern = filter; - for (;;) { - if (PatternMatchesString(cur_pattern, name.c_str())) { - return true; - } - - // Finds the next pattern in the filter. - cur_pattern = strchr(cur_pattern, ':'); - - // Returns if no more pattern can be found. - if (cur_pattern == NULL) { - return false; - } - - // Skips the pattern separater (the ':' character). - cur_pattern++; - } -} - -// Returns true iff the user-specified filter matches the test case -// name and the test name. 
-bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name) { - const std::string& full_name = test_case_name + "." + test_name.c_str(); - - // Split --gtest_filter at '-', if there is one, to separate into - // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - std::string positive; - std::string negative; - if (dash == NULL) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = ""; - } else { - positive = std::string(p, dash); // Everything up to the dash - negative = std::string(dash + 1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. It matches a - // test if any pattern in it matches the test. - return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); -} - -#if GTEST_HAS_SEH -// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the -// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. -// This function is useful as an __except condition. -int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { - // Google Test should handle a SEH exception if: - // 1. the user wants it to, AND - // 2. this is not a breakpoint exception, AND - // 3. this is not a C++ exception (VC++ implements them via SEH, - // apparently). - // - // SEH exception code for C++ exceptions. - // (see http://support.microsoft.com/kb/185294 for more information). - const DWORD kCxxExceptionCode = 0xe06d7363; - - bool should_handle = true; - - if (!GTEST_FLAG(catch_exceptions)) - should_handle = false; - else if (exception_code == EXCEPTION_BREAKPOINT) - should_handle = false; - else if (exception_code == kCxxExceptionCode) - should_handle = false; - - return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; -} -#endif // GTEST_HAS_SEH - -} // namespace internal - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. Intercepts only failures from the current thread. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { - Init(); -} - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { - Init(); -} - -void ScopedFakeTestPartResultReporter::Init() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - old_reporter_ = impl->GetGlobalTestPartResultReporter(); - impl->SetGlobalTestPartResultReporter(this); - } else { - old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); - impl->SetTestPartResultReporterForCurrentThread(this); - } -} - -// The d'tor restores the test part result reporter used by Google Test -// before. 
-ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - impl->SetGlobalTestPartResultReporter(old_reporter_); - } else { - impl->SetTestPartResultReporterForCurrentThread(old_reporter_); - } -} - -// Increments the test part result count and remembers the result. -// This method is from the TestPartResultReporterInterface interface. -void ScopedFakeTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - result_->Append(result); -} - -namespace internal { - -// Returns the type ID of ::testing::Test. We should always call this -// instead of GetTypeId< ::testing::Test>() to get the type ID of -// testing::Test. This is to work around a suspected linker bug when -// using Google Test as a framework on Mac OS X. The bug causes -// GetTypeId< ::testing::Test>() to return different values depending -// on whether the call is from the Google Test framework itself or -// from user test code. GetTestTypeId() is guaranteed to always -// return the same value, as it always calls GetTypeId<>() from the -// gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId(); -} - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); - -// This predicate-formatter checks that 'results' contains a test part -// failure of the given type and that the failure message contains the -// given substring. -AssertionResult HasOneFailure(const char* /* results_expr */, - const char* /* type_expr */, - const char* /* substr_expr */, - const TestPartResultArray& results, - TestPartResult::Type type, - const string& substr) { - const std::string expected(type == TestPartResult::kFatalFailure ? - "1 fatal failure" : - "1 non-fatal failure"); - Message msg; - if (results.size() != 1) { - msg << "Expected: " << expected << "\n" - << " Actual: " << results.size() << " failures"; - for (int i = 0; i < results.size(); i++) { - msg << "\n" << results.GetTestPartResult(i); - } - return AssertionFailure() << msg; - } - - const TestPartResult& r = results.GetTestPartResult(0); - if (r.type() != type) { - return AssertionFailure() << "Expected: " << expected << "\n" - << " Actual:\n" - << r; - } - - if (strstr(r.message(), substr.c_str()) == NULL) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; - } - - return AssertionSuccess(); -} - -// The constructor of SingleFailureChecker remembers where to look up -// test part results, what type of failure we expect, and what -// substring the failure message should contain. -SingleFailureChecker:: SingleFailureChecker( - const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr) - : results_(results), - type_(type), - substr_(substr) {} - -// The destructor of SingleFailureChecker verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. 
-SingleFailureChecker::~SingleFailureChecker() { - EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); -} - -DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultGlobalTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->current_test_result()->AddTestPartResult(result); - unit_test_->listeners()->repeater()->OnTestPartResult(result); -} - -DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); -} - -// Returns the global test part result reporter. -TestPartResultReporterInterface* -UnitTestImpl::GetGlobalTestPartResultReporter() { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - return global_test_part_result_repoter_; -} - -// Sets the global test part result reporter. -void UnitTestImpl::SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter) { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - global_test_part_result_repoter_ = reporter; -} - -// Returns the test part result reporter for the current thread. -TestPartResultReporterInterface* -UnitTestImpl::GetTestPartResultReporterForCurrentThread() { - return per_thread_test_part_result_reporter_.get(); -} - -// Sets the test part result reporter for the current thread. -void UnitTestImpl::SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter) { - per_thread_test_part_result_reporter_.set(reporter); -} - -// Gets the number of successful test cases. -int UnitTestImpl::successful_test_case_count() const { - return CountIf(test_cases_, TestCasePassed); -} - -// Gets the number of failed test cases. -int UnitTestImpl::failed_test_case_count() const { - return CountIf(test_cases_, TestCaseFailed); -} - -// Gets the number of all test cases. -int UnitTestImpl::total_test_case_count() const { - return static_cast(test_cases_.size()); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTestImpl::test_case_to_run_count() const { - return CountIf(test_cases_, ShouldRunTestCase); -} - -// Gets the number of successful tests. -int UnitTestImpl::successful_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); -} - -// Gets the number of failed tests. -int UnitTestImpl::failed_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); -} - -// Gets the number of disabled tests that will be reported in the XML report. -int UnitTestImpl::reportable_disabled_test_count() const { - return SumOverTestCaseList(test_cases_, - &TestCase::reportable_disabled_test_count); -} - -// Gets the number of disabled tests. -int UnitTestImpl::disabled_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); -} - -// Gets the number of tests to be printed in the XML report. -int UnitTestImpl::reportable_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); -} - -// Gets the number of all tests. -int UnitTestImpl::total_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); -} - -// Gets the number of tests that should run. 
-int UnitTestImpl::test_to_run_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); -} - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// CurrentOsStackTraceExceptTop(1), Foo() will be included in the -// trace but Bar() and CurrentOsStackTraceExceptTop() won't. -std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { - (void)skip_count; - return ""; -} - -// Returns the current time in milliseconds. -TimeInMillis GetTimeInMillis() { -#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) - // Difference between 1970-01-01 and 1601-01-01 in milliseconds. - // http://analogous.blogspot.com/2005/04/epoch.html - const TimeInMillis kJavaEpochToWinFileTimeDelta = - static_cast(116444736UL) * 100000UL; - const DWORD kTenthMicrosInMilliSecond = 10000; - - SYSTEMTIME now_systime; - FILETIME now_filetime; - ULARGE_INTEGER now_int64; - // TODO(kenton@google.com): Shouldn't this just use - // GetSystemTimeAsFileTime()? - GetSystemTime(&now_systime); - if (SystemTimeToFileTime(&now_systime, &now_filetime)) { - now_int64.LowPart = now_filetime.dwLowDateTime; - now_int64.HighPart = now_filetime.dwHighDateTime; - now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - - kJavaEpochToWinFileTimeDelta; - return now_int64.QuadPart; - } - return 0; -#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ - __timeb64 now; - -# ifdef _MSC_VER - - // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 - // (deprecated function) there. - // TODO(kenton@google.com): Use GetTickCount()? Or use - // SystemTimeToFileTime() -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4996) // Temporarily disables warning 4996. - _ftime64(&now); -# pragma warning(pop) // Restores the warning state. -# else - - _ftime64(&now); - -# endif // _MSC_VER - - return static_cast(now.time) * 1000 + now.millitm; -#elif GTEST_HAS_GETTIMEOFDAY_ - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; -#else -# error "Don't know how to get the current time on your system." -#endif -} - -// Utilities - -// class String. - -#if GTEST_OS_WINDOWS_MOBILE -// Creates a UTF-16 wide string from the given ANSI string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the wide string, or NULL if the -// input is NULL. -LPCWSTR String::AnsiToUtf16(const char* ansi) { - if (!ansi) return NULL; - const int length = strlen(ansi); - const int unicode_length = - MultiByteToWideChar(CP_ACP, 0, ansi, length, - NULL, 0); - WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); - unicode[unicode_length] = 0; - return unicode; -} - -// Creates an ANSI string from the given wide string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the ANSI string, or NULL if the -// input is NULL. 
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { - if (!utf16_str) return NULL; - const int ansi_length = - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - NULL, 0, NULL, NULL); - char* ansi = new char[ansi_length + 1]; - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - ansi, ansi_length, NULL, NULL); - ansi[ansi_length] = 0; - return ansi; -} - -#endif // GTEST_OS_WINDOWS_MOBILE - -// Compares two C strings. Returns true iff they have the same content. -// -// Unlike strcmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { - if ( lhs == NULL ) return rhs == NULL; - - if ( rhs == NULL ) return false; - - return strcmp(lhs, rhs) == 0; -} - -#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -// Converts an array of wide chars to a narrow string using the UTF-8 -// encoding, and streams the result to the given Message object. -static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, - Message* msg) { - for (size_t i = 0; i != length; ) { // NOLINT - if (wstr[i] != L'\0') { - *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); - while (i != length && wstr[i] != L'\0') - i++; - } else { - *msg << '\0'; - i++; - } - } -} - -#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -} // namespace internal - -// Constructs an empty Message. -// We allocate the stringstream separately because otherwise each use of -// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's -// stack frame leading to huge stack frames in some cases; gcc does not reuse -// the stack space. -Message::Message() : ss_(new ::std::stringstream) { - // By default, we want there to be enough precision when printing - // a double to a Message. - *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); -} - -// These two overloads allow streaming a wide C string to a Message -// using the UTF-8 encoding. -Message& Message::operator <<(const wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); -} -Message& Message::operator <<(wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); -} - -#if GTEST_HAS_STD_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Gets the text streamed to this object so far as an std::string. -// Each '\0' character in the buffer is replaced with "\\0". -std::string Message::GetString() const { - return internal::StringStreamToString(ss_.get()); -} - -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != NULL ? - new ::std::string(*other.message_) : - static_cast< ::std::string*>(NULL)) { -} - -// Returns the assertion's negation. 
Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != NULL) - negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. -AssertionResult AssertionFailure(const Message& message) { - return AssertionFailure() << message; -} - -namespace internal { - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const std::string& expected_value, - const std::string& actual_value, - bool ignoring_case) { - Message msg; - msg << "Value of: " << actual_expression; - if (actual_value != actual_expression) { - msg << "\n Actual: " << actual_value; - } - - msg << "\nExpected: " << expected_expression; - if (ignoring_case) { - msg << " (ignoring case)"; - } - if (expected_value != expected_expression) { - msg << "\nWhich is: " << expected_value; - } - - return AssertionFailure() << msg; -} - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value) { - const char* actual_message = assertion_result.message(); - Message msg; - msg << "Value of: " << expression_text - << "\n Actual: " << actual_predicate_value; - if (actual_message[0] != '\0') - msg << " (" << actual_message << ")"; - msg << "\nExpected: " << expected_predicate_value; - return msg.GetString(); -} - -// Helper function for implementing ASSERT_NEAR. -AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error) { - const double diff = fabs(val1 - val2); - if (diff <= abs_error) return AssertionSuccess(); - - // TODO(wan): do not print the value of an expression if it's - // already a literal. - return AssertionFailure() - << "The difference between " << expr1 << " and " << expr2 - << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" - << expr1 << " evaluates to " << val1 << ",\n" - << expr2 << " evaluates to " << val2 << ", and\n" - << abs_error_expr << " evaluates to " << abs_error << "."; -} - - -// Helper template for implementing FloatLE() and DoubleLE(). -template -AssertionResult FloatingPointLE(const char* expr1, - const char* expr2, - RawType val1, - RawType val2) { - // Returns success if val1 is less than val2, - if (val1 < val2) { - return AssertionSuccess(); - } - - // or if val1 is almost equal to val2. 
- const FloatingPoint lhs(val1), rhs(val2); - if (lhs.AlmostEquals(rhs)) { - return AssertionSuccess(); - } - - // Note that the above two checks will both fail if either val1 or - // val2 is NaN, as the IEEE floating-point standard requires that - // any predicate involving a NaN must return false. - - ::std::stringstream val1_ss; - val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << val1; - - ::std::stringstream val2_ss; - val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << val2; - - return AssertionFailure() - << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" - << " Actual: " << StringStreamToString(&val1_ss) << " vs " - << StringStreamToString(&val2_ss); -} - -} // namespace internal - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult FloatLE(const char* expr1, const char* expr2, - float val1, float val2) { - return internal::FloatingPointLE(expr1, expr2, val1, val2); -} - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult DoubleLE(const char* expr1, const char* expr2, - double val1, double val2) { - return internal::FloatingPointLE(expr1, expr2, val1, val2); -} - -namespace internal { - -// The helper function for {ASSERT|EXPECT}_EQ with int or enum -// arguments. -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - if (expected == actual) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); -} - -// A macro for implementing the helper functions needed to implement -// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here -// just to avoid copy-and-paste of similar code. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - BiggestInt val1, BiggestInt val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ - }\ -} - -// Implements the helper function for {ASSERT|EXPECT}_NE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(NE, !=) -// Implements the helper function for {ASSERT|EXPECT}_LE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LE, <=) -// Implements the helper function for {ASSERT|EXPECT}_LT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LT, < ) -// Implements the helper function for {ASSERT|EXPECT}_GE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GE, >=) -// Implements the helper function for {ASSERT|EXPECT}_GT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GT, > ) - -#undef GTEST_IMPL_CMP_HELPER_ - -// The helper function for {ASSERT|EXPECT}_STREQ. 
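FloatLE() and DoubleLE() above delegate the "almost equal" part to FloatingPoint<RawType>::AlmostEquals(), which is defined in gtest's headers rather than in this hunk and treats two values as equal when their bit patterns are within a small fixed number of units in the last place; as the comments note, NaN never compares equal. A simplified standalone sketch of that style of comparison for double, assuming IEEE-754 representation and an assumed 4-ULP tolerance (all names are illustrative):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Maps a double's bit pattern onto an unsigned scale that is monotonically
// ordered, so the distance between two mapped values is their ULP distance.
uint64_t ToOrderedBits(double d) {
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  const uint64_t kSignBit = 1ULL << 63;
  return (bits & kSignBit) ? ~bits + 1 : bits | kSignBit;
}

// Returns true if lhs and rhs are within max_ulps units in the last place.
// NaN never compares equal, matching the behavior described above.
bool AlmostEqualUlps(double lhs, double rhs, uint64_t max_ulps = 4) {
  if (std::isnan(lhs) || std::isnan(rhs)) return false;
  const uint64_t a = ToOrderedBits(lhs);
  const uint64_t b = ToOrderedBits(rhs);
  return ((a > b) ? a - b : b - a) <= max_ulps;
}

int main() {
  const double sum = 0.1 + 0.2;  // not exactly 0.3 in binary floating point
  std::printf("== : %d, ulps: %d\n", sum == 0.3, AlmostEqualUlps(sum, 0.3));
  return 0;
}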
-AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - false); -} - -// The helper function for {ASSERT|EXPECT}_STRCASEEQ. -AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CaseInsensitiveCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - true); -} - -// The helper function for {ASSERT|EXPECT}_STRNE. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -// The helper function for {ASSERT|EXPECT}_STRCASENE. -AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CaseInsensitiveCStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() - << "Expected: (" << s1_expression << ") != (" - << s2_expression << ") (ignoring case), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -} // namespace internal - -namespace { - -// Helper functions for implementing IsSubString() and IsNotSubstring(). - -// This group of overloaded functions return true iff needle is a -// substring of haystack. NULL is considered a substring of itself -// only. - -bool IsSubstringPred(const char* needle, const char* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return strstr(haystack, needle) != NULL; -} - -bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return wcsstr(haystack, needle) != NULL; -} - -// StringType here can be either ::std::string or ::std::wstring. -template -bool IsSubstringPred(const StringType& needle, - const StringType& haystack) { - return haystack.find(needle) != StringType::npos; -} - -// This function implements either IsSubstring() or IsNotSubstring(), -// depending on the value of the expected_to_be_substring parameter. -// StringType here can be const char*, const wchar_t*, ::std::string, -// or ::std::wstring. -template -AssertionResult IsSubstringImpl( - bool expected_to_be_substring, - const char* needle_expr, const char* haystack_expr, - const StringType& needle, const StringType& haystack) { - if (IsSubstringPred(needle, haystack) == expected_to_be_substring) - return AssertionSuccess(); - - const bool is_wide_string = sizeof(needle[0]) > 1; - const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; - return AssertionFailure() - << "Value of: " << needle_expr << "\n" - << " Actual: " << begin_string_quote << needle << "\"\n" - << "Expected: " << (expected_to_be_substring ? 
"" : "not ") - << "a substring of " << haystack_expr << "\n" - << "Which is: " << begin_string_quote << haystack << "\""; -} - -} // namespace - -// IsSubstring() and IsNotSubstring() check whether needle is a -// substring of haystack (NULL is considered a substring of itself -// only), and return an appropriate error message when they fail. - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -#if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} -#endif // GTEST_HAS_STD_WSTRING - -namespace internal { - -#if GTEST_OS_WINDOWS - -namespace { - -// Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, - long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE - - // Windows CE doesn't support FormatMessage. - const char error_text[] = ""; - -# else - - // Looks up the human-readable system message for the HRESULT code - // and since we're not passing any params to FormatMessage, we don't - // want inserts expanded. - const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; - const DWORD kBufSize = 4096; - // Gets the system's human readable message string for this HRESULT. 
- char error_text[kBufSize] = { '\0' }; - DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system - hr, // the error - 0, // no line width restrictions - error_text, // output buffer - kBufSize, // buf size - NULL); // no arguments for inserts - // Trims tailing white space (FormatMessage leaves a trailing CR-LF) - for (; message_length && IsSpace(error_text[message_length - 1]); - --message_length) { - error_text[message_length - 1] = '\0'; - } - -# endif // GTEST_OS_WINDOWS_MOBILE - - const std::string error_hex("0x" + String::FormatHexInt(hr)); - return ::testing::AssertionFailure() - << "Expected: " << expr << " " << expected << ".\n" - << " Actual: " << error_hex << " " << error_text << "\n"; -} - -} // namespace - -AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT - if (SUCCEEDED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "succeeds", hr); -} - -AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT - if (FAILED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "fails", hr); -} - -#endif // GTEST_OS_WINDOWS - -// Utility functions for encoding Unicode text (wide strings) in -// UTF-8. - -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 -// like this: -// -// Code-point length Encoding -// 0 - 7 bits 0xxxxxxx -// 8 - 11 bits 110xxxxx 10xxxxxx -// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx -// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - -// The maximum code-point a one-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; - -// The maximum code-point a two-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; - -// The maximum code-point a three-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; - -// The maximum code-point a four-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; - -// Chops off the n lowest bits from a bit pattern. Returns the n -// lowest bits. As a side effect, the original bit pattern will be -// shifted to the right by n bits. -inline UInt32 ChopLowBits(UInt32* bits, int n) { - const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); - *bits >>= n; - return low_bits; -} - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted -// to "(Invalid Unicode 0xXXXXXXXX)". -std::string CodePointToUtf8(UInt32 code_point) { - if (code_point > kMaxCodePoint4) { - return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; - } - - char str[5]; // Big enough for the largest valid code point. 
- if (code_point <= kMaxCodePoint1) { - str[1] = '\0'; - str[0] = static_cast(code_point); // 0xxxxxxx - } else if (code_point <= kMaxCodePoint2) { - str[2] = '\0'; - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xC0 | code_point); // 110xxxxx - } else if (code_point <= kMaxCodePoint3) { - str[3] = '\0'; - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xE0 | code_point); // 1110xxxx - } else { // code_point <= kMaxCodePoint4 - str[4] = '\0'; - str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xF0 | code_point); // 11110xxx - } - return str; -} - -// The following two functions only make sense if the the system -// uses UTF-16 for wide string encoding. All supported systems -// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. - -// Determines if the arguments constitute UTF-16 surrogate pair -// and thus should be combined into a single Unicode code point -// using CreateCodePointFromUtf16SurrogatePair. -inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { - return sizeof(wchar_t) == 2 && - (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; -} - -// Creates a Unicode code point from UTF16 surrogate pair. -inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, - wchar_t second) { - const UInt32 mask = (1 << 10) - 1; - return (sizeof(wchar_t) == 2) ? - (((first & mask) << 10) | (second & mask)) + 0x10000 : - // This function should not be called when the condition is - // false, but we provide a sensible default in case it is. - static_cast(first); -} - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. -std::string WideStringToUtf8(const wchar_t* str, int num_chars) { - if (num_chars == -1) - num_chars = static_cast(wcslen(str)); - - ::std::stringstream stream; - for (int i = 0; i < num_chars; ++i) { - UInt32 unicode_code_point; - - if (str[i] == L'\0') { - break; - } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { - unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], - str[i + 1]); - i++; - } else { - unicode_code_point = static_cast(str[i]); - } - - stream << CodePointToUtf8(unicode_code_point); - } - return StringStreamToString(&stream); -} - -// Converts a wide C string to an std::string using the UTF-8 encoding. -// NULL will be converted to "(null)". 
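To make the surrogate-pair handling above concrete: U+1F600 is stored in UTF-16 as the pair 0xD83D 0xDE00; CreateCodePointFromUtf16SurrogatePair() keeps the low ten bits of each half, concatenates them and adds 0x10000, and CodePointToUtf8() then emits the four bytes F0 9F 98 80. A small standalone check of that arithmetic (helper names are illustrative, not gtest's):

#include <cstdint>
#include <cstdio>
#include <string>

// Combines a UTF-16 surrogate pair into a Unicode code point, using the same
// formula as the removed CreateCodePointFromUtf16SurrogatePair().
uint32_t CombineSurrogates(uint16_t high, uint16_t low) {
  const uint32_t kMask = (1 << 10) - 1;
  return (((high & kMask) << 10) | (low & kMask)) + 0x10000;
}

// Encodes a code point in the U+10000..U+10FFFF range as four UTF-8 bytes.
// Sketch only; the removed CodePointToUtf8() handles all sequence lengths.
std::string EncodeUtf8FourBytes(uint32_t cp) {
  std::string out;
  out += static_cast<char>(0xF0 | (cp >> 18));
  out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
  out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
  out += static_cast<char>(0x80 | (cp & 0x3F));
  return out;
}

int main() {
  const uint32_t cp = CombineSurrogates(0xD83D, 0xDE00);
  std::printf("U+%X =", cp);  // Expected: U+1F600 = F0 9F 98 80
  for (unsigned char byte : EncodeUtf8FourBytes(cp)) std::printf(" %02X", byte);
  std::printf("\n");
  return 0;
}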
-std::string String::ShowWideCString(const wchar_t * wide_c_str) { - if (wide_c_str == NULL) return "(null)"; - - return internal::WideStringToUtf8(wide_c_str, -1); -} - -// Compares two wide C strings. Returns true iff they have the same -// content. -// -// Unlike wcscmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - - return wcscmp(lhs, rhs) == 0; -} - -// Helper function for *_STREQ on wide strings. -AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual) { - if (String::WideCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - false); -} - -// Helper function for *_STRNE on wide strings. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2) { - if (!String::WideCStringEquals(s1, s2)) { - return AssertionSuccess(); - } - - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: " - << PrintToString(s1) - << " vs " << PrintToString(s2); -} - -// Compares two C strings, ignoring case. Returns true iff they have -// the same content. -// -// Unlike strcasecmp(), this function can handle NULL argument(s). A -// NULL C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { - if (lhs == NULL) - return rhs == NULL; - if (rhs == NULL) - return false; - return posix::StrCaseCmp(lhs, rhs) == 0; -} - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. -bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - -#if GTEST_OS_WINDOWS - return _wcsicmp(lhs, rhs) == 0; -#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID - return wcscasecmp(lhs, rhs) == 0; -#else - // Android, Mac OS X and Cygwin don't define wcscasecmp. - // Other unknown OSes may not define it either. - wint_t left, right; - do { - left = towlower(*lhs++); - right = towlower(*rhs++); - } while (left && left == right); - return left == right; -#endif // OS selector -} - -// Returns true iff str ends with the given suffix, ignoring case. -// Any string is considered to end with an empty suffix. 
-bool String::EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix) { - const size_t str_len = str.length(); - const size_t suffix_len = suffix.length(); - return (str_len >= suffix_len) && - CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, - suffix.c_str()); -} - -// Formats an int value as "%02d". -std::string String::FormatIntWidth2(int value) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << value; - return ss.str(); -} - -// Formats an int value as "%X". -std::string String::FormatHexInt(int value) { - std::stringstream ss; - ss << std::hex << std::uppercase << value; - return ss.str(); -} - -// Formats a byte as "%02X". -std::string String::FormatByte(unsigned char value) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase - << static_cast(value); - return ss.str(); -} - -// Converts the buffer in a stringstream to an std::string, converting NUL -// bytes to "\\0" along the way. -std::string StringStreamToString(::std::stringstream* ss) { - const ::std::string& str = ss->str(); - const char* const start = str.c_str(); - const char* const end = start + str.length(); - - std::string result; - result.reserve(2 * (end - start)); - for (const char* ch = start; ch != end; ++ch) { - if (*ch == '\0') { - result += "\\0"; // Replaces NUL with "\\0"; - } else { - result += *ch; - } - } - - return result; -} - -// Appends the user-supplied message to the Google-Test-generated message. -std::string AppendUserMessage(const std::string& gtest_msg, - const Message& user_msg) { - // Appends the user message if it's non-empty. - const std::string user_msg_string = user_msg.GetString(); - if (user_msg_string.empty()) { - return gtest_msg; - } - - return gtest_msg + "\n" + user_msg_string; -} - -} // namespace internal - -// class TestResult - -// Creates an empty TestResult. -TestResult::TestResult() - : death_test_count_(0), - elapsed_time_(0) { -} - -// D'tor. -TestResult::~TestResult() { -} - -// Returns the i-th test part result among all the results. i can -// range from 0 to total_part_count() - 1. If i is not in that range, -// aborts the program. -const TestPartResult& TestResult::GetTestPartResult(int i) const { - if (i < 0 || i >= total_part_count()) - internal::posix::Abort(); - return test_part_results_.at(i); -} - -// Returns the i-th test property. i can range from 0 to -// test_property_count() - 1. If i is not in that range, aborts the -// program. -const TestProperty& TestResult::GetTestProperty(int i) const { - if (i < 0 || i >= test_property_count()) - internal::posix::Abort(); - return test_properties_.at(i); -} - -// Clears the test part results. -void TestResult::ClearTestPartResults() { - test_part_results_.clear(); -} - -// Adds a test part result to the list. -void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { - test_part_results_.push_back(test_part_result); -} - -// Adds a test property to the list. If a property with the same key as the -// supplied property is already represented, the value of this test_property -// replaces the old value for that key. 
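StringStreamToString() above copies the stream buffer byte by byte so that a message containing an embedded NUL is not silently truncated by C-string handling; each '\0' comes out as the two characters "\0". A short standalone sketch of that behavior (the helper name is made up):

#include <cstdio>
#include <sstream>
#include <string>

// Copies a stringstream's buffer into a std::string, replacing each embedded
// NUL byte with the two-character escape "\0", as the removed
// StringStreamToString() does. Illustrative sketch only.
std::string EscapeNulBytes(std::stringstream* ss) {
  const std::string& str = ss->str();
  std::string result;
  result.reserve(2 * str.size());
  for (char ch : str) {
    if (ch == '\0') {
      result += "\\0";
    } else {
      result += ch;
    }
  }
  return result;
}

int main() {
  std::stringstream ss;
  ss << "ab" << '\0' << "cd";                        // buffer: a b NUL c d
  std::printf("%s\n", EscapeNulBytes(&ss).c_str());  // prints: ab\0cd
  return 0;
}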
-void TestResult::RecordProperty(const std::string& xml_element, - const TestProperty& test_property) { - if (!ValidateTestProperty(xml_element, test_property)) { - return; - } - internal::MutexLock lock(&test_properites_mutex_); - const std::vector::iterator property_with_matching_key = - std::find_if(test_properties_.begin(), test_properties_.end(), - internal::TestPropertyKeyIs(test_property.key())); - if (property_with_matching_key == test_properties_.end()) { - test_properties_.push_back(test_property); - return; - } - property_with_matching_key->SetValue(test_property.value()); -} - -// The list of reserved attributes used in the element of XML -// output. -static const char* const kReservedTestSuitesAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "random_seed", - "tests", - "time", - "timestamp" -}; - -// The list of reserved attributes used in the element of XML -// output. -static const char* const kReservedTestSuiteAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "tests", - "time" -}; - -// The list of reserved attributes used in the element of XML output. -static const char* const kReservedTestCaseAttributes[] = { - "classname", - "name", - "status", - "time", - "type_param", - "value_param" -}; - -template -std::vector ArrayAsVector(const char* const (&array)[kSize]) { - return std::vector(array, array + kSize); -} - -static std::vector GetReservedAttributesForElement( - const std::string& xml_element) { - if (xml_element == "testsuites") { - return ArrayAsVector(kReservedTestSuitesAttributes); - } else if (xml_element == "testsuite") { - return ArrayAsVector(kReservedTestSuiteAttributes); - } else if (xml_element == "testcase") { - return ArrayAsVector(kReservedTestCaseAttributes); - } else { - GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; - } - // This code is unreachable but some compilers may not realizes that. - return std::vector(); -} - -static std::string FormatWordList(const std::vector& words) { - Message word_list; - for (size_t i = 0; i < words.size(); ++i) { - if (i > 0 && words.size() > 2) { - word_list << ", "; - } - if (i == words.size() - 1) { - word_list << "and "; - } - word_list << "'" << words[i] << "'"; - } - return word_list.GetString(); -} - -bool ValidateTestPropertyName(const std::string& property_name, - const std::vector& reserved_names) { - if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != - reserved_names.end()) { - ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name - << " (" << FormatWordList(reserved_names) - << " are reserved by " << GTEST_NAME_ << ")"; - return false; - } - return true; -} - -// Adds a failure if the key is a reserved attribute of the element named -// xml_element. Returns true if the property is valid. -bool TestResult::ValidateTestProperty(const std::string& xml_element, - const TestProperty& test_property) { - return ValidateTestPropertyName(test_property.key(), - GetReservedAttributesForElement(xml_element)); -} - -// Clears the object. -void TestResult::Clear() { - test_part_results_.clear(); - test_properties_.clear(); - death_test_count_ = 0; - elapsed_time_ = 0; -} - -// Returns true iff the test failed. -bool TestResult::Failed() const { - for (int i = 0; i < total_part_count(); ++i) { - if (GetTestPartResult(i).failed()) - return true; - } - return false; -} - -// Returns true iff the test part fatally failed. 
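The reserved-attribute tables above are what ValidateTestProperty() checks RecordProperty() calls against: a key that collides with an attribute Google Test itself writes to the XML report (for example "name", "status" or "time" on a testcase element) is rejected with an ADD_FAILURE(). A usage-level sketch, assuming the file is linked against gtest_main; the test and property names are made up:

#include "gtest/gtest.h"

TEST(RecordPropertyDemo, CustomKeysAreKept) {
  // A user-chosen key is attached to this test's testcase element in the
  // XML report; the int overload shown above formats the value for us.
  RecordProperty("build_id", 1234);
}

TEST(RecordPropertyDemo, ReservedKeysAreRejected) {
  // "time" is listed in kReservedTestCaseAttributes, so this call is expected
  // to add a failure ("Reserved key used in RecordProperty(): ...") instead
  // of recording anything.
  RecordProperty("time", "42");
}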
-static bool TestPartFatallyFailed(const TestPartResult& result) { - return result.fatally_failed(); -} - -// Returns true iff the test fatally failed. -bool TestResult::HasFatalFailure() const { - return CountIf(test_part_results_, TestPartFatallyFailed) > 0; -} - -// Returns true iff the test part non-fatally failed. -static bool TestPartNonfatallyFailed(const TestPartResult& result) { - return result.nonfatally_failed(); -} - -// Returns true iff the test has a non-fatal failure. -bool TestResult::HasNonfatalFailure() const { - return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; -} - -// Gets the number of all test parts. This is the sum of the number -// of successful test parts and the number of failed test parts. -int TestResult::total_part_count() const { - return static_cast(test_part_results_.size()); -} - -// Returns the number of the test properties. -int TestResult::test_property_count() const { - return static_cast(test_properties_.size()); -} - -// class Test - -// Creates a Test object. - -// The c'tor saves the values of all Google Test flags. -Test::Test() - : gtest_flag_saver_(new internal::GTestFlagSaver) { -} - -// The d'tor restores the values of all Google Test flags. -Test::~Test() { - delete gtest_flag_saver_; -} - -// Sets up the test fixture. -// -// A sub-class may override this. -void Test::SetUp() { -} - -// Tears down the test fixture. -// -// A sub-class may override this. -void Test::TearDown() { -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const std::string& key, const std::string& value) { - UnitTest::GetInstance()->RecordProperty(key, value); -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const std::string& key, int value) { - Message value_message; - value_message << value; - RecordProperty(key, value_message.GetString().c_str()); -} - -namespace internal { - -void ReportFailureInUnknownLocation(TestPartResult::Type result_type, - const std::string& message) { - // This function is a friend of UnitTest and as such has access to - // AddTestPartResult. - UnitTest::GetInstance()->AddTestPartResult( - result_type, - NULL, // No info about the source file where the exception occurred. - -1, // We have no info on which line caused the exception. - message, - ""); // No stack trace, either. -} - -} // namespace internal - -// Google Test requires all tests in the same test case to use the same test -// fixture class. This function checks if the current test has the -// same fixture class as the first test in the current test case. If -// yes, it returns true; otherwise it generates a Google Test failure and -// returns false. -bool Test::HasSameFixtureClass() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - const TestCase* const test_case = impl->current_test_case(); - - // Info about the first test in the current test case. - const TestInfo* const first_test_info = test_case->test_info_list()[0]; - const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; - const char* const first_test_name = first_test_info->name(); - - // Info about the current test. - const TestInfo* const this_test_info = impl->current_test_info(); - const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; - const char* const this_test_name = this_test_info->name(); - - if (this_fixture_id != first_fixture_id) { - // Is the first test defined using TEST? 
- const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); - // Is this test defined using TEST? - const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); - - if (first_is_TEST || this_is_TEST) { - // The user mixed TEST and TEST_F in this test case - we'll tell - // him/her how to fix it. - - // Gets the name of the TEST and the name of the TEST_F. Note - // that first_is_TEST and this_is_TEST cannot both be true, as - // the fixture IDs are different for the two tests. - const char* const TEST_name = - first_is_TEST ? first_test_name : this_test_name; - const char* const TEST_F_name = - first_is_TEST ? this_test_name : first_test_name; - - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class, so mixing TEST_F and TEST in the same test case is\n" - << "illegal. In test case " << this_test_info->test_case_name() - << ",\n" - << "test " << TEST_F_name << " is defined using TEST_F but\n" - << "test " << TEST_name << " is defined using TEST. You probably\n" - << "want to change the TEST to TEST_F or move it to another test\n" - << "case."; - } else { - // The user defined two fixture classes with the same name in - // two namespaces - we'll tell him/her how to fix it. - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " - << this_test_info->test_case_name() << ",\n" - << "you defined test " << first_test_name - << " and test " << this_test_name << "\n" - << "using two different test fixture classes. This can happen if\n" - << "the two classes are from different namespaces or translation\n" - << "units and have the same name. You should probably rename one\n" - << "of the classes to put the tests into different test cases."; - } - return false; - } - - return true; -} - -#if GTEST_HAS_SEH - -// Adds an "exception thrown" fatal failure to the current test. This -// function returns its result via an output parameter pointer because VC++ -// prohibits creation of objects with destructors on stack in functions -// using __try (see error C2712). -static std::string* FormatSehExceptionMessage(DWORD exception_code, - const char* location) { - Message message; - message << "SEH exception with code 0x" << std::setbase(16) << - exception_code << std::setbase(10) << " thrown in " << location << "."; - - return new std::string(message.GetString()); -} - -#endif // GTEST_HAS_SEH - -namespace internal { - -#if GTEST_HAS_EXCEPTIONS - -// Adds an "exception thrown" fatal failure to the current test. -static std::string FormatCxxExceptionMessage(const char* description, - const char* location) { - Message message; - if (description != NULL) { - message << "C++ exception with description \"" << description << "\""; - } else { - message << "Unknown C++ exception"; - } - message << " thrown in " << location << "."; - - return message.GetString(); -} - -static std::string PrintTestPartResultToString( - const TestPartResult& test_part_result); - -GoogleTestFailureException::GoogleTestFailureException( - const TestPartResult& failure) - : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} - -#endif // GTEST_HAS_EXCEPTIONS - -// We put these helper functions in the internal namespace as IBM's xlC -// compiler rejects the code if they were declared static. - -// Runs the given method and handles SEH exceptions it throws, when -// SEH is supported; returns the 0-value for type Result in case of an -// SEH exception. 
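HasSameFixtureClass() above turns a common mistake into an explicit failure: two tests that share a test case name but are declared with TEST in one place and TEST_F in another end up with different fixture ids. A minimal reproduction, assuming linkage against gtest_main (the fixture and test names are made up):

#include "gtest/gtest.h"

class WidgetTest : public ::testing::Test {};

// Uses the WidgetTest fixture.
TEST_F(WidgetTest, UsesTheFixture) {
  EXPECT_TRUE(true);
}

// Reuses the test case name "WidgetTest" without the fixture. This compiles,
// but at run time HasSameFixtureClass() reports that mixing TEST_F and TEST
// in the same test case is illegal.
TEST(WidgetTest, MixesMacros) {
  EXPECT_TRUE(true);
}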
(Microsoft compilers cannot handle SEH and C++ -// exceptions in the same function. Therefore, we provide a separate -// wrapper function for handling SEH exceptions.) -template -Result HandleSehExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { -#if GTEST_HAS_SEH - __try { - return (object->*method)(); - } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT - GetExceptionCode())) { - // We create the exception message on the heap because VC++ prohibits - // creation of objects with destructors on stack in functions using __try - // (see error C2712). - std::string* exception_message = FormatSehExceptionMessage( - GetExceptionCode(), location); - internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, - *exception_message); - delete exception_message; - return static_cast(0); - } -#else - (void)location; - return (object->*method)(); -#endif // GTEST_HAS_SEH -} - -// Runs the given method and catches and reports C++ and/or SEH-style -// exceptions, if they are supported; returns the 0-value for type -// Result in case of an SEH exception. -template -Result HandleExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { - // NOTE: The user code can affect the way in which Google Test handles - // exceptions by setting GTEST_FLAG(catch_exceptions), but only before - // RUN_ALL_TESTS() starts. It is technically possible to check the flag - // after the exception is caught and either report or re-throw the - // exception based on the flag's value: - // - // try { - // // Perform the test method. - // } catch (...) { - // if (GTEST_FLAG(catch_exceptions)) - // // Report the exception as failure. - // else - // throw; // Re-throws the original exception. - // } - // - // However, the purpose of this flag is to allow the program to drop into - // the debugger when the exception is thrown. On most platforms, once the - // control enters the catch block, the exception origin information is - // lost and the debugger will stop the program at the point of the - // re-throw in this function -- instead of at the point of the original - // throw statement in the code under test. For this reason, we perform - // the check early, sacrificing the ability to affect Google Test's - // exception handling in the method where the exception is thrown. - if (internal::GetUnitTestImpl()->catch_exceptions()) { -#if GTEST_HAS_EXCEPTIONS - try { - return HandleSehExceptionsInMethodIfSupported(object, method, location); - } catch (const internal::GoogleTestFailureException&) { // NOLINT - // This exception type can only be thrown by a failed Google - // Test assertion with the intention of letting another testing - // framework catch it. Therefore we just re-throw it. - throw; - } catch (const std::exception& e) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(e.what(), location)); - } catch (...) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(NULL, location)); - } - return static_cast(0); -#else - return HandleSehExceptionsInMethodIfSupported(object, method, location); -#endif // GTEST_HAS_EXCEPTIONS - } else { - return (object->*method)(); - } -} - -} // namespace internal - -// Runs the test and updates the test result. 
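HandleExceptionsInMethodIfSupported() above follows a general pattern: invoke a member function and, when exception catching is enabled, convert anything it throws into a recorded fatal failure instead of letting it unwind out of the test runner. A stripped-down standalone sketch of the same pattern with the failure reporter reduced to a plain function; it omits the SEH branch and the GoogleTestFailureException special case, and every name in it is illustrative:

#include <cstdio>
#include <exception>
#include <stdexcept>
#include <string>

// Reduced stand-in for ReportFailureInUnknownLocation().
void ReportFailure(const std::string& message) {
  std::printf("FAILURE: %s\n", message.c_str());
}

// Runs (object->*method)() and reports, rather than propagates, any C++
// exception when catch_exceptions is true.
template <class T, typename Result>
Result RunAndCatch(T* object, Result (T::*method)(), const char* location,
                   bool catch_exceptions) {
  if (!catch_exceptions) return (object->*method)();
  try {
    return (object->*method)();
  } catch (const std::exception& e) {
    ReportFailure(std::string("C++ exception with description \"") + e.what() +
                  "\" thrown in " + location + ".");
  } catch (...) {
    ReportFailure(std::string("Unknown C++ exception thrown in ") + location +
                  ".");
  }
  return Result();  // the zero value for Result, as in the removed code
}

struct FlakySetUp {
  int Run() { throw std::runtime_error("database unavailable"); }
};

int main() {
  FlakySetUp fixture;
  // Prints a failure message instead of terminating on an uncaught throw.
  RunAndCatch(&fixture, &FlakySetUp::Run, "SetUp()", true);
  return 0;
}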
-void Test::Run() { - if (!HasSameFixtureClass()) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); - // We will run the test only if SetUp() was successful. - if (!HasFatalFailure()) { - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TestBody, "the test body"); - } - - // However, we want to clean up as much as possible. Hence we will - // always call TearDown(), even if SetUp() or the test body has - // failed. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TearDown, "TearDown()"); -} - -// Returns true iff the current test has a fatal failure. -bool Test::HasFatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); -} - -// Returns true iff the current test has a non-fatal failure. -bool Test::HasNonfatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()-> - HasNonfatalFailure(); -} - -// class TestInfo - -// Constructs a TestInfo object. It assumes ownership of the test factory -// object. -TestInfo::TestInfo(const std::string& a_test_case_name, - const std::string& a_name, - const char* a_type_param, - const char* a_value_param, - internal::TypeId fixture_class_id, - internal::TestFactoryBase* factory) - : test_case_name_(a_test_case_name), - name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - value_param_(a_value_param ? new std::string(a_value_param) : NULL), - fixture_class_id_(fixture_class_id), - should_run_(false), - is_disabled_(false), - matches_filter_(false), - factory_(factory), - result_() {} - -// Destructs a TestInfo object. -TestInfo::~TestInfo() { delete factory_; } - -namespace internal { - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param: the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param: text representation of the test's value parameter, -// or NULL if this is not a value-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory) { - TestInfo* const test_info = - new TestInfo(test_case_name, name, type_param, value_param, - fixture_class_id, factory); - GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); - return test_info; -} - -#if GTEST_HAS_PARAM_TEST -void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line) { - Message errors; - errors - << "Attempted redefinition of test case " << test_case_name << ".\n" - << "All tests in the same test case must use the same test fixture\n" - << "class. 
However, in test case " << test_case_name << ", you tried\n" - << "to define a test using a fixture class different from the one\n" - << "used earlier. This can happen if the two fixture classes are\n" - << "from different namespaces and have the same name. You should\n" - << "probably rename one of the classes to put the tests into different\n" - << "test cases."; - - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors.GetString().c_str()); -} -#endif // GTEST_HAS_PARAM_TEST - -} // namespace internal - -namespace { - -// A predicate that checks the test name of a TestInfo against a known -// value. -// -// This is used for implementation of the TestCase class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestNameIs is copyable. -class TestNameIs { - public: - // Constructor. - // - // TestNameIs has NO default constructor. - explicit TestNameIs(const char* name) - : name_(name) {} - - // Returns true iff the test name of test_info matches name_. - bool operator()(const TestInfo * test_info) const { - return test_info && test_info->name() == name_; - } - - private: - std::string name_; -}; - -} // namespace - -namespace internal { - -// This method expands all parameterized tests registered with macros TEST_P -// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. -// This will be done just once during the program runtime. -void UnitTestImpl::RegisterParameterizedTests() { -#if GTEST_HAS_PARAM_TEST - if (!parameterized_tests_registered_) { - parameterized_test_registry_.RegisterTests(); - parameterized_tests_registered_ = true; - } -#endif -} - -} // namespace internal - -// Creates the test object, runs it, records its result, and then -// deletes it. -void TestInfo::Run() { - if (!should_run_) return; - - // Tells UnitTest where to store test result. - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_info(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - // Notifies the unit test event listeners that a test is about to start. - repeater->OnTestStart(*this); - - const TimeInMillis start = internal::GetTimeInMillis(); - - impl->os_stack_trace_getter()->UponLeavingGTest(); - - // Creates the test object. - Test* const test = internal::HandleExceptionsInMethodIfSupported( - factory_, &internal::TestFactoryBase::CreateTest, - "the test fixture's constructor"); - - // Runs the test only if the test object was created and its - // constructor didn't generate a fatal failure. - if ((test != NULL) && !Test::HasFatalFailure()) { - // This doesn't throw as all user code that can throw are wrapped into - // exception handling code. - test->Run(); - } - - // Deletes the test object. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - test, &Test::DeleteSelf_, "the test fixture's destructor"); - - result_.set_elapsed_time(internal::GetTimeInMillis() - start); - - // Notifies the unit test event listener that a test has just finished. - repeater->OnTestEnd(*this); - - // Tells UnitTest to stop associating assertion results to this - // test. - impl->set_current_test_info(NULL); -} - -// class TestCase - -// Gets the number of successful tests in this test case. -int TestCase::successful_test_count() const { - return CountIf(test_info_list_, TestPassed); -} - -// Gets the number of failed tests in this test case. 
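RegisterParameterizedTests() above is the hook that expands TEST_P tests and their INSTANTIATE_TEST_CASE_P instantiations into ordinary registered tests at startup. A usage-level sketch of that macro pair, assuming linkage against gtest_main (the test names and parameter values are made up):

#include "gtest/gtest.h"

class ParityTest : public ::testing::TestWithParam<int> {};

// One TEST_P body, run once per parameter value supplied below.
TEST_P(ParityTest, DoublingIsEven) {
  EXPECT_EQ(0, (GetParam() * 2) % 2);
}

// Expanded into ordinary tests by RegisterParameterizedTests() at startup.
INSTANTIATE_TEST_CASE_P(SmallValues, ParityTest, ::testing::Values(1, 2, 3));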
-int TestCase::failed_test_count() const { - return CountIf(test_info_list_, TestFailed); -} - -// Gets the number of disabled tests that will be reported in the XML report. -int TestCase::reportable_disabled_test_count() const { - return CountIf(test_info_list_, TestReportableDisabled); -} - -// Gets the number of disabled tests in this test case. -int TestCase::disabled_test_count() const { - return CountIf(test_info_list_, TestDisabled); -} - -// Gets the number of tests to be printed in the XML report. -int TestCase::reportable_test_count() const { - return CountIf(test_info_list_, TestReportable); -} - -// Get the number of tests in this test case that should run. -int TestCase::test_to_run_count() const { - return CountIf(test_info_list_, ShouldRunTest); -} - -// Gets the number of all tests. -int TestCase::total_test_count() const { - return static_cast(test_info_list_.size()); -} - -// Creates a TestCase with the given name. -// -// Arguments: -// -// name: name of the test case -// a_type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase::TestCase(const char* a_name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) - : name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - set_up_tc_(set_up_tc), - tear_down_tc_(tear_down_tc), - should_run_(false), - elapsed_time_(0) { -} - -// Destructor of TestCase. -TestCase::~TestCase() { - // Deletes every Test in the collection. - ForEach(test_info_list_, internal::Delete); -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -const TestInfo* TestCase::GetTestInfo(int i) const { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -TestInfo* TestCase::GetMutableTestInfo(int i) { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Adds a test to this test case. Will delete the test upon -// destruction of the TestCase object. -void TestCase::AddTestInfo(TestInfo * test_info) { - test_info_list_.push_back(test_info); - test_indices_.push_back(static_cast(test_indices_.size())); -} - -// Runs every test in this TestCase. 
-void TestCase::Run() { - if (!should_run_) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_case(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - repeater->OnTestCaseStart(*this); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); - - const internal::TimeInMillis start = internal::GetTimeInMillis(); - for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - } - elapsed_time_ = internal::GetTimeInMillis() - start; - - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); - - repeater->OnTestCaseEnd(*this); - impl->set_current_test_case(NULL); -} - -// Clears the results of all tests in this test case. -void TestCase::ClearResult() { - ad_hoc_test_result_.Clear(); - ForEach(test_info_list_, TestInfo::ClearTestResult); -} - -// Shuffles the tests in this test case. -void TestCase::ShuffleTests(internal::Random* random) { - Shuffle(random, &test_indices_); -} - -// Restores the test order to before the first shuffle. -void TestCase::UnshuffleTests() { - for (size_t i = 0; i < test_indices_.size(); i++) { - test_indices_[i] = static_cast(i); - } -} - -// Formats a countable noun. Depending on its quantity, either the -// singular form or the plural form is used. e.g. -// -// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". -// FormatCountableNoun(5, "book", "books") returns "5 books". -static std::string FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { - return internal::StreamableToString(count) + " " + - (count == 1 ? singular_form : plural_form); -} - -// Formats the count of tests. -static std::string FormatTestCount(int test_count) { - return FormatCountableNoun(test_count, "test", "tests"); -} - -// Formats the count of test cases. -static std::string FormatTestCaseCount(int test_case_count) { - return FormatCountableNoun(test_case_count, "test case", "test cases"); -} - -// Converts a TestPartResult::Type enum to human-friendly string -// representation. Both kNonFatalFailure and kFatalFailure are translated -// to "Failure", as the user usually doesn't care about the difference -// between the two when viewing the test result. -static const char * TestPartResultTypeToString(TestPartResult::Type type) { - switch (type) { - case TestPartResult::kSuccess: - return "Success"; - - case TestPartResult::kNonFatalFailure: - case TestPartResult::kFatalFailure: -#ifdef _MSC_VER - return "error: "; -#else - return "Failure\n"; -#endif - default: - return "Unknown result type"; - } -} - -namespace internal { - -// Prints a TestPartResult to an std::string. -static std::string PrintTestPartResultToString( - const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); -} - -// Prints a TestPartResult. 
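TestCase::Run() above brackets the per-test loop with SetUpTestCase() and TearDownTestCase(), so those hooks run once per test case rather than once per test. A usage sketch of the hook pair, assuming linkage against gtest_main (class and member names are made up):

#include "gtest/gtest.h"

class DatabaseTest : public ::testing::Test {
 protected:
  // Runs once, before the first test of this test case (see TestCase::Run()).
  static void SetUpTestCase() { connection_count_ = 1; }
  // Runs once, after the last test of this test case.
  static void TearDownTestCase() { connection_count_ = 0; }

  static int connection_count_;
};

int DatabaseTest::connection_count_ = 0;

TEST_F(DatabaseTest, SeesTheCaseWideSetUp) {
  EXPECT_EQ(1, connection_count_);
}

TEST_F(DatabaseTest, StillSeesIt) {
  EXPECT_EQ(1, connection_count_);
}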
-static void PrintTestPartResult(const TestPartResult& test_part_result) { - const std::string& result = - PrintTestPartResultToString(test_part_result); - printf("%s\n", result.c_str()); - fflush(stdout); - // If the test program runs in Visual Studio or a debugger, the - // following statements add the test part result message to the Output - // window such that the user can double-click on it to jump to the - // corresponding source code location; otherwise they do nothing. -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - // We don't call OutputDebugString*() on Windows Mobile, as printing - // to stdout is done by OutputDebugString() there already - we don't - // want the same message printed twice. - ::OutputDebugStringA(result.c_str()); - ::OutputDebugStringA("\n"); -#endif -} - -// class PrettyUnitTestResultPrinter - -enum GTestColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW -}; - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns the character attribute for the given color. -WORD GetColorAttribute(GTestColor color) { - switch (color) { - case COLOR_RED: return FOREGROUND_RED; - case COLOR_GREEN: return FOREGROUND_GREEN; - case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; - } -} - -#else - -// Returns the ANSI color code for the given color. COLOR_DEFAULT is -// an invalid input. -const char* GetAnsiColorCode(GTestColor color) { - switch (color) { - case COLOR_RED: return "1"; - case COLOR_GREEN: return "2"; - case COLOR_YELLOW: return "3"; - default: return NULL; - }; -} - -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns true iff Google Test should use colors in the output. -bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); - - if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { -#if GTEST_OS_WINDOWS - // On Windows the TERM variable is usually not set, but the - // console there does support colors. - return stdout_is_tty; -#else - // On non-Windows platforms, we rely on the TERM variable. - const char* const term = posix::GetEnv("TERM"); - const bool term_supports_color = - String::CStringEquals(term, "xterm") || - String::CStringEquals(term, "xterm-color") || - String::CStringEquals(term, "xterm-256color") || - String::CStringEquals(term, "screen") || - String::CStringEquals(term, "screen-256color") || - String::CStringEquals(term, "linux") || - String::CStringEquals(term, "cygwin"); - return stdout_is_tty && term_supports_color; -#endif // GTEST_OS_WINDOWS - } - - return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); - // We take "yes", "true", "t", and "1" as meaning "yes". If the - // value is neither one of these nor "auto", we treat it as "no" to - // be conservative. -} - -// Helpers for printing colored strings to stdout. Note that on Windows, we -// cannot simply emit special characters and have the terminal change colors. -// This routine must actually emit the characters rather than return a string -// that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) 
{ - va_list args; - va_start(args, fmt); - -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS - const bool use_color = false; -#else - static const bool in_color_mode = - ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); - const bool use_color = in_color_mode && (color != COLOR_DEFAULT); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS - // The '!= 0' comparison is necessary to satisfy MSVC 7.1. - - if (!use_color) { - vprintf(fmt, args); - va_end(args); - return; - } - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); - - // Gets the current text color. - CONSOLE_SCREEN_BUFFER_INFO buffer_info; - GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; - - // We need to flush the stream buffers into the console before each - // SetConsoleTextAttribute call lest it affect the text that is already - // printed but has not yet reached the console. - fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetColorAttribute(color) | FOREGROUND_INTENSITY); - vprintf(fmt, args); - - fflush(stdout); - // Restores the text color. - SetConsoleTextAttribute(stdout_handle, old_color_attrs); -#else - printf("\033[0;3%sm", GetAnsiColorCode(color)); - vprintf(fmt, args); - printf("\033[m"); // Resets the terminal to default. -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - va_end(args); -} - -// Text printed in Google Test's text output and --gunit_list_tests -// output to label the type parameter and value parameter for a test. -static const char kTypeParamLabel[] = "TypeParam"; -static const char kValueParamLabel[] = "GetParam()"; - -void PrintFullTestCommentIfPresent(const TestInfo& test_info) { - const char* const type_param = test_info.type_param(); - const char* const value_param = test_info.value_param(); - - if (type_param != NULL || value_param != NULL) { - printf(", where "); - if (type_param != NULL) { - printf("%s = %s", kTypeParamLabel, type_param); - if (value_param != NULL) - printf(" and "); - } - if (value_param != NULL) { - printf("%s = %s", kValueParamLabel, value_param); - } - } -} - -// This class implements the TestEventListener interface. -// -// Class PrettyUnitTestResultPrinter is copyable. -class PrettyUnitTestResultPrinter : public TestEventListener { - public: - PrettyUnitTestResultPrinter() {} - static void PrintTestName(const char * test_case, const char * test) { - printf("%s.%s", test_case, test); - } - - // The following methods override what's in the TestEventListener class. 
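In the non-Windows branch of ColoredPrintf() above, the text is wrapped in an ANSI SGR escape: "\033[0;3Nm" selects foreground color N (1 red, 2 green, 3 yellow, matching GetAnsiColorCode()) and "\033[m" resets the terminal. A minimal sketch of that branch only; it skips the TTY and TERM checks done by ShouldUseColor(), and the Windows console-attribute path is omitted:

#include <cstdarg>
#include <cstdio>

enum AnsiColor { kRed = 1, kGreen = 2, kYellow = 3 };

// Prints the formatted text wrapped in an ANSI color escape sequence.
// Illustrative sketch; real code should first check that stdout is a TTY.
void PrintColored(AnsiColor color, const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  std::printf("\033[0;3%dm", static_cast<int>(color));  // set foreground color
  std::vprintf(fmt, args);
  std::printf("\033[m");                                // reset attributes
  va_end(args);
}

int main() {
  PrintColored(kGreen, "[==========] ");
  std::printf("Running %d tests.\n", 3);
  return 0;
}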
- virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} - - private: - static void PrintFailedTests(const UnitTest& unit_test); -}; - - // Fired before each iteration of tests starts. -void PrettyUnitTestResultPrinter::OnTestIterationStart( - const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) - printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); - - const char* const filter = GTEST_FLAG(filter).c_str(); - - // Prints the filter if it's not *. This reminds the user that some - // tests may be skipped. - if (!String::CStringEquals(filter, kUniversalFilter)) { - ColoredPrintf(COLOR_YELLOW, - "Note: %s filter = %s\n", GTEST_NAME_, filter); - } - - if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { - const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); - ColoredPrintf(COLOR_YELLOW, - "Note: This is test shard %d of %s.\n", - static_cast(shard_index) + 1, - internal::posix::GetEnv(kTestTotalShards)); - } - - if (GTEST_FLAG(shuffle)) { - ColoredPrintf(COLOR_YELLOW, - "Note: Randomizing tests' orders with a seed of %d .\n", - unit_test.random_seed()); - } - - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("Running %s from %s.\n", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment set-up.\n"); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { - const std::string counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s", counts.c_str(), test_case.name()); - if (test_case.type_param() == NULL) { - printf("\n"); - } else { - printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { - ColoredPrintf(COLOR_GREEN, "[ RUN ] "); - PrintTestName(test_info.test_case_name(), test_info.name()); - printf("\n"); - fflush(stdout); -} - -// Called after an assertion failure. -void PrettyUnitTestResultPrinter::OnTestPartResult( - const TestPartResult& result) { - // If the test part succeeded, we don't need to do anything. - if (result.type() == TestPartResult::kSuccess) - return; - - // Print failure message from the assertion (e.g. expected this and got that). 
- PrintTestPartResult(result); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { - if (test_info.result()->Passed()) { - ColoredPrintf(COLOR_GREEN, "[ OK ] "); - } else { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - } - PrintTestName(test_info.test_case_name(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); - - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); - } else { - printf("\n"); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; - - const std::string counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s (%s ms total)\n\n", - counts.c_str(), test_case.name(), - internal::StreamableToString(test_case.elapsed_time()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment tear-down\n"); - fflush(stdout); -} - -// Internal helper for printing the list of failed tests. -void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { - const int failed_test_count = unit_test.failed_test_count(); - if (failed_test_count == 0) { - return; - } - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - const TestCase& test_case = *unit_test.GetTestCase(i); - if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { - continue; - } - for (int j = 0; j < test_case.total_test_count(); ++j) { - const TestInfo& test_info = *test_case.GetTestInfo(j); - if (!test_info.should_run() || test_info.result()->Passed()) { - continue; - } - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s.%s", test_case.name(), test_info.name()); - PrintFullTestCommentIfPresent(test_info); - printf("\n"); - } - } -} - -void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("%s from %s ran.", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms total)", - internal::StreamableToString(unit_test.elapsed_time()).c_str()); - } - printf("\n"); - ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); - printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); - - int num_failures = unit_test.failed_test_count(); - if (!unit_test.Passed()) { - const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); - PrintFailedTests(unit_test); - printf("\n%2d FAILED %s\n", num_failures, - num_failures == 1 ? "TEST" : "TESTS"); - } - - int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { - if (!num_failures) { - printf("\n"); // Add a spacer if no FAILURE banner is displayed. - } - ColoredPrintf(COLOR_YELLOW, - " YOU HAVE %d DISABLED %s\n\n", - num_disabled, - num_disabled == 1 ? "TEST" : "TESTS"); - } - // Ensure that Google Test output is printed before, e.g., heapchecker output. 
- fflush(stdout); -} - -// End PrettyUnitTestResultPrinter - -// class TestEventRepeater -// -// This class forwards events to other event listeners. -class TestEventRepeater : public TestEventListener { - public: - TestEventRepeater() : forwarding_enabled_(true) {} - virtual ~TestEventRepeater(); - void Append(TestEventListener *listener); - TestEventListener* Release(TestEventListener* listener); - - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled() const { return forwarding_enabled_; } - void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } - - virtual void OnTestProgramStart(const UnitTest& unit_test); - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& unit_test); - - private: - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled_; - // The list of listeners that receive events. - std::vector listeners_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); -}; - -TestEventRepeater::~TestEventRepeater() { - ForEach(listeners_, Delete); -} - -void TestEventRepeater::Append(TestEventListener *listener) { - listeners_.push_back(listener); -} - -// TODO(vladl@google.com): Factor the search functionality into Vector::Find. -TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { - for (size_t i = 0; i < listeners_.size(); ++i) { - if (listeners_[i] == listener) { - listeners_.erase(listeners_.begin() + i); - return listener; - } - } - - return NULL; -} - -// Since most methods are very similar, use macros to reduce boilerplate. -// This defines a member that forwards the call to all listeners. -#define GTEST_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (size_t i = 0; i < listeners_.size(); i++) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} -// This defines a member that forwards the call to all listeners in reverse -// order. 
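The repeater defined above is the mechanism that fans test events out to every registered listener; user code normally reaches it through the public TestEventListeners API (Append/Release further below). As a minimal usage sketch, assuming only the standard gtest headers, a custom listener can be attached like this (MinimalListener is a made-up name, not part of the library):

#include "gtest/gtest.h"
#include <cstdio>

// Hypothetical listener that prints a line when each test ends.
class MinimalListener : public ::testing::EmptyTestEventListener {
 public:
  virtual void OnTestEnd(const ::testing::TestInfo& test_info) {
    std::printf("[listener] %s.%s finished\n",
                test_info.test_case_name(), test_info.name());
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::testing::TestEventListeners& listeners =
      ::testing::UnitTest::GetInstance()->listeners();
  // Appended listeners receive events through the TestEventRepeater;
  // ownership passes to Google Test, which deletes them at program exit.
  listeners.Append(new MinimalListener);
  return RUN_ALL_TESTS();
}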
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} - -GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) -GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) -GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) -GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) -GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) -GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) -GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) -GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) - -#undef GTEST_REPEATER_METHOD_ -#undef GTEST_REVERSE_REPEATER_METHOD_ - -void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (size_t i = 0; i < listeners_.size(); i++) { - listeners_[i]->OnTestIterationStart(unit_test, iteration); - } - } -} - -void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { - listeners_[i]->OnTestIterationEnd(unit_test, iteration); - } - } -} - -// End TestEventRepeater - -// This class generates an XML output file. -class XmlUnitTestResultPrinter : public EmptyTestEventListener { - public: - explicit XmlUnitTestResultPrinter(const char* output_file); - - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - - private: - // Is c a whitespace character that is normalized to a space character - // when it appears in an XML attribute value? - static bool IsNormalizableWhitespace(char c) { - return c == 0x9 || c == 0xA || c == 0xD; - } - - // May c appear in a well-formed XML document? - static bool IsValidXmlCharacter(char c) { - return IsNormalizableWhitespace(c) || c >= 0x20; - } - - // Returns an XML-escaped copy of the input string str. If - // is_attribute is true, the text is meant to appear as an attribute - // value, and normalizable whitespace is preserved by replacing it - // with character references. - static std::string EscapeXml(const std::string& str, bool is_attribute); - - // Returns the given string with all characters invalid in XML removed. - static std::string RemoveInvalidXmlCharacters(const std::string& str); - - // Convenience wrapper around EscapeXml when str is an attribute value. - static std::string EscapeXmlAttribute(const std::string& str) { - return EscapeXml(str, true); - } - - // Convenience wrapper around EscapeXml when str is not an attribute value. - static std::string EscapeXmlText(const char* str) { - return EscapeXml(str, false); - } - - // Verifies that the given attribute belongs to the given element and - // streams the attribute as XML. - static void OutputXmlAttribute(std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value); - - // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. - static void OutputXmlCDataSection(::std::ostream* stream, const char* data); - - // Streams an XML representation of a TestInfo object. 
- static void OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, - const TestInfo& test_info); - - // Prints an XML representation of a TestCase object - static void PrintXmlTestCase(::std::ostream* stream, - const TestCase& test_case); - - // Prints an XML summary of unit_test to output stream out. - static void PrintXmlUnitTest(::std::ostream* stream, - const UnitTest& unit_test); - - // Produces a string representing the test properties in a result as space - // delimited XML attributes based on the property key="value" pairs. - // When the std::string is not empty, it includes a space at the beginning, - // to delimit this attribute from prior attributes. - static std::string TestPropertiesAsXmlAttributes(const TestResult& result); - - // The output file. - const std::string output_file_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); -}; - -// Creates a new XmlUnitTestResultPrinter. -XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) - : output_file_(output_file) { - if (output_file_.c_str() == NULL || output_file_.empty()) { - fprintf(stderr, "XML output file may not be null\n"); - fflush(stderr); - exit(EXIT_FAILURE); - } -} - -// Called after the unit test ends. -void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - FILE* xmlout = NULL; - FilePath output_file(output_file_); - FilePath output_dir(output_file.RemoveFileName()); - - if (output_dir.CreateDirectoriesRecursively()) { - xmlout = posix::FOpen(output_file_.c_str(), "w"); - } - if (xmlout == NULL) { - // TODO(wan): report the reason of the failure. - // - // We don't do it for now as: - // - // 1. There is no urgent need for it. - // 2. It's a bit involved to make the errno variable thread-safe on - // all three operating systems (Linux, Windows, and Mac OS). - // 3. To interpret the meaning of errno in a thread-safe way, - // we need the strerror_r() function, which is not available on - // Windows. - fprintf(stderr, - "Unable to open file \"%s\"\n", - output_file_.c_str()); - fflush(stderr); - exit(EXIT_FAILURE); - } - std::stringstream stream; - PrintXmlUnitTest(&stream, unit_test); - fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); - fclose(xmlout); -} - -// Returns an XML-escaped copy of the input string str. If is_attribute -// is true, the text is meant to appear as an attribute value, and -// normalizable whitespace is preserved by replacing it with character -// references. -// -// Invalid XML characters in str, if any, are stripped from the output. -// It is expected that most, if not all, of the text processed by this -// module will consist of ordinary English text. -// If this module is ever modified to produce version 1.1 XML output, -// most invalid characters can be retained using character references. -// TODO(wan): It might be nice to have a minimally invasive, human-readable -// escaping scheme for invalid characters, rather than dropping them. 
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
-  Message m;
-
-  for (size_t i = 0; i < str.size(); ++i) {
-    const char ch = str[i];
-    switch (ch) {
-      case '<':
-        m << "&lt;";
-        break;
-      case '>':
-        m << "&gt;";
-        break;
-      case '&':
-        m << "&amp;";
-        break;
-      case '\'':
-        if (is_attribute)
-          m << "&apos;";
-        else
-          m << '\'';
-        break;
-      case '"':
-        if (is_attribute)
-          m << "&quot;";
-        else
-          m << '"';
-        break;
-      default:
-        if (IsValidXmlCharacter(ch)) {
-          if (is_attribute && IsNormalizableWhitespace(ch))
-            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
-              << ";";
-          else
-            m << ch;
-        }
-        break;
-    }
-  }
-
-  return m.GetString();
-}
-
-// Returns the given string with all characters invalid in XML removed.
-// Currently invalid characters are dropped from the string. An
-// alternative is to replace them with certain characters such as . or ?.
-std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
-    const std::string& str) {
-  std::string output;
-  output.reserve(str.size());
-  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
-      output.push_back(*it);
-
-  return output;
-}
-
-// The following routines generate an XML representation of a UnitTest
-// object.
-//
-// This is how Google Test concepts map to the DTD:
-//
-// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
-//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
-//     <testcase name="test-name">     <-- corresponds to a TestInfo object
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//                                     <-- individual assertion failures
-//     </testcase>
-//   </testsuite>
-// </testsuites>
-
-// Formats the given time in milliseconds as seconds.
-std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
-  ::std::stringstream ss;
-  ss << ms/1000.0;
-  return ss.str();
-}
-
-// Converts the given epoch time in milliseconds to a date string in the ISO
-// 8601 format, without the timezone information.
-std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
-  // Using non-reentrant version as localtime_r is not portable.
-  time_t seconds = static_cast<time_t>(ms / 1000);
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4996)  // Temporarily disables warning 4996
-                                // (function or variable may be unsafe).
-  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
-# pragma warning(pop)           // Restores the warning state again.
-#else
-  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
-#endif
-  if (time_struct == NULL)
-    return "";  // Invalid ms value
-
-  // YYYY-MM-DDThh:mm:ss
-  return StreamableToString(time_struct->tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct->tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct->tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct->tm_min) + ":" +
-      String::FormatIntWidth2(time_struct->tm_sec);
-}
-
-// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
-                                                     const char* data) {
-  const char* segment = data;
-  *stream << "<![CDATA[";
-  for (;;) {
-    const char* const next_segment = strstr(segment, "]]>");
-    if (next_segment != NULL) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
-      *stream << "]]>]]&gt;<![CDATA[";
-      segment = next_segment + strlen("]]>");
-    } else {
-      *stream << segment;
-      break;
-    }
-  }
-  *stream << "]]>";
-}
-
-void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedAttributesForElement(element_name);
-
-  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
-      << "Attribute " << name << " is not allowed for element <" << element_name
-      << ">.";
-
-  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
-}
-
-// Prints an XML representation of a TestInfo object.
-// TODO(wan): There is also value in printing properties with the plain printer.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
-                                                 const char* test_case_name,
-                                                 const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  const std::string kTestcase = "testcase";
-
-  *stream << "    <testcase";
-  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
-
-  if (test_info.value_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "value_param",
-                       test_info.value_param());
-  }
-  if (test_info.type_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
-  }
-
-  OutputXmlAttribute(stream, kTestcase, "status",
-                     test_info.should_run() ? "run" : "notrun");
-  OutputXmlAttribute(stream, kTestcase, "time",
-                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
-  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
-  *stream << TestPropertiesAsXmlAttributes(result);
-
-  int failures = 0;
-  for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
-    if (part.failed()) {
-      if (++failures == 1) {
-        *stream << ">\n";
-      }
-      const string location = internal::FormatCompilerIndependentFileLocation(
-          part.file_name(), part.line_number());
-      const string summary = location + "\n" + part.summary();
-      *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
-              << "\" type=\"\">";
-      const string detail = location + "\n" + part.message();
-      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
-      *stream << "</failure>\n";
-    }
-  }
-
-  if (failures == 0)
-    *stream << " />\n";
-  else
-    *stream << "    </testcase>\n";
-}
-
-// Prints an XML representation of a TestCase object
-void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
-                                                const TestCase& test_case) {
-  const std::string kTestsuite = "testsuite";
-  *stream << "  <" << kTestsuite;
-  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
-  OutputXmlAttribute(stream, kTestsuite, "tests",
-                     StreamableToString(test_case.reportable_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "failures",
-                     StreamableToString(test_case.failed_test_count()));
-  OutputXmlAttribute(
-      stream, kTestsuite, "disabled",
-      StreamableToString(test_case.reportable_disabled_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
-  OutputXmlAttribute(stream, kTestsuite, "time",
-                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
-  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
-          << ">\n";
-
-  for (int i = 0; i < test_case.total_test_count(); ++i) {
-    if (test_case.GetTestInfo(i)->is_reportable())
-      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
-  }
-  *stream << "  </" << kTestsuite << ">\n";
-}
-
-// Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, - const UnitTest& unit_test) { - const std::string kTestsuites = "testsuites"; - - *stream << "\n"; - *stream << "<" << kTestsuites; - - OutputXmlAttribute(stream, kTestsuites, "tests", - StreamableToString(unit_test.reportable_test_count())); - OutputXmlAttribute(stream, kTestsuites, "failures", - StreamableToString(unit_test.failed_test_count())); - OutputXmlAttribute( - stream, kTestsuites, "disabled", - StreamableToString(unit_test.reportable_disabled_test_count())); - OutputXmlAttribute(stream, kTestsuites, "errors", "0"); - OutputXmlAttribute( - stream, kTestsuites, "timestamp", - FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); - OutputXmlAttribute(stream, kTestsuites, "time", - FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); - - if (GTEST_FLAG(shuffle)) { - OutputXmlAttribute(stream, kTestsuites, "random_seed", - StreamableToString(unit_test.random_seed())); - } - - *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); - - OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); - *stream << ">\n"; - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - if (unit_test.GetTestCase(i)->reportable_test_count() > 0) - PrintXmlTestCase(stream, *unit_test.GetTestCase(i)); - } - *stream << "\n"; -} - -// Produces a string representing the test properties in a result as space -// delimited XML attributes based on the property key="value" pairs. -std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( - const TestResult& result) { - Message attributes; - for (int i = 0; i < result.test_property_count(); ++i) { - const TestProperty& property = result.GetTestProperty(i); - attributes << " " << property.key() << "=" - << "\"" << EscapeXmlAttribute(property.value()) << "\""; - } - return attributes.GetString(); -} - -// End XmlUnitTestResultPrinter - -#if GTEST_CAN_STREAM_RESULTS_ - -// Checks if str contains '=', '&', '%' or '\n' characters. If yes, -// replaces them by "%xx" where xx is their hexadecimal value. For -// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) -// in both time and space -- important as the input str may contain an -// arbitrarily long test failure message and stack trace. -string StreamingListener::UrlEncode(const char* str) { - string result; - result.reserve(strlen(str) + 1); - for (char ch = *str; ch != '\0'; ch = *++str) { - switch (ch) { - case '%': - case '=': - case '&': - case '\n': - result.append("%" + String::FormatByte(static_cast(ch))); - break; - default: - result.push_back(ch); - break; - } - } - return result; -} - -void StreamingListener::SocketWriter::MakeConnection() { - GTEST_CHECK_(sockfd_ == -1) - << "MakeConnection() can't be called when there is already a connection."; - - addrinfo hints; - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. - hints.ai_socktype = SOCK_STREAM; - addrinfo* servinfo = NULL; - - // Use the getaddrinfo() to get a linked list of IP addresses for - // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); - if (error_num != 0) { - GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " - << gai_strerror(error_num); - } - - // Loop through all the results and connect to the first we can. 
- for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; - cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); - if (sockfd_ != -1) { - // Connect the client socket to the server socket. - if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { - close(sockfd_); - sockfd_ = -1; - } - } - } - - freeaddrinfo(servinfo); // all done with this structure - - if (sockfd_ == -1) { - GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " - << host_name_ << ":" << port_num_; - } -} - -// End of class Streaming Listener -#endif // GTEST_CAN_STREAM_RESULTS__ - -// Class ScopedTrace - -// Pushes the given source file location and message onto a per-thread -// trace stack maintained by Google Test. -ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - TraceInfo trace; - trace.file = file; - trace.line = line; - trace.message = message.GetString(); - - UnitTest::GetInstance()->PushGTestTrace(trace); -} - -// Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - UnitTest::GetInstance()->PopGTestTrace(); -} - - -// class OsStackTraceGetter - -// Returns the current OS stack trace as an std::string. Parameters: -// -// max_depth - the maximum number of stack frames to be included -// in the trace. -// skip_count - the number of top frames to be skipped; doesn't count -// against max_depth. -// -string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */, - int /* skip_count */) - GTEST_LOCK_EXCLUDED_(mutex_) { - return ""; -} - -void OsStackTraceGetter::UponLeavingGTest() - GTEST_LOCK_EXCLUDED_(mutex_) { -} - -const char* const -OsStackTraceGetter::kElidedFramesMarker = - "... " GTEST_NAME_ " internal frames ..."; - -// A helper class that creates the premature-exit file in its -// constructor and deletes the file in its destructor. -class ScopedPrematureExitFile { - public: - explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath) { - // If a path to the premature-exit file is specified... - if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { - // create the file with a single "0" character in it. I/O - // errors are ignored as there's nothing better we can do and we - // don't want to fail the test because of this. - FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); - fwrite("0", 1, 1, pfile); - fclose(pfile); - } - } - - ~ScopedPrematureExitFile() { - if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { - remove(premature_exit_filepath_); - } - } - - private: - const char* const premature_exit_filepath_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); -}; - -} // namespace internal - -// class TestEventListeners - -TestEventListeners::TestEventListeners() - : repeater_(new internal::TestEventRepeater()), - default_result_printer_(NULL), - default_xml_generator_(NULL) { -} - -TestEventListeners::~TestEventListeners() { delete repeater_; } - -// Returns the standard listener responsible for the default console -// output. Can be removed from the listeners list to shut down default -// console output. Note that removing this object from the listener list -// with Release transfers its ownership to the user. 
-void TestEventListeners::Append(TestEventListener* listener) { - repeater_->Append(listener); -} - -// Removes the given event listener from the list and returns it. It then -// becomes the caller's responsibility to delete the listener. Returns -// NULL if the listener is not found in the list. -TestEventListener* TestEventListeners::Release(TestEventListener* listener) { - if (listener == default_result_printer_) - default_result_printer_ = NULL; - else if (listener == default_xml_generator_) - default_xml_generator_ = NULL; - return repeater_->Release(listener); -} - -// Returns repeater that broadcasts the TestEventListener events to all -// subscribers. -TestEventListener* TestEventListeners::repeater() { return repeater_; } - -// Sets the default_result_printer attribute to the provided listener. -// The listener is also added to the listener list and previous -// default_result_printer is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { - if (default_result_printer_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_result_printer_); - default_result_printer_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Sets the default_xml_generator attribute to the provided listener. The -// listener is also added to the listener list and previous -// default_xml_generator is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { - if (default_xml_generator_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_xml_generator_); - default_xml_generator_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Controls whether events will be forwarded by the repeater to the -// listeners in the list. -bool TestEventListeners::EventForwardingEnabled() const { - return repeater_->forwarding_enabled(); -} - -void TestEventListeners::SuppressEventForwarding() { - repeater_->set_forwarding_enabled(false); -} - -// class UnitTest - -// Gets the singleton UnitTest object. The first time this method is -// called, a UnitTest object is constructed and returned. Consecutive -// calls will return the same object. -// -// We don't protect this under mutex_ as a user is not supposed to -// call this before main() starts, from which point on the return -// value will never change. -UnitTest* UnitTest::GetInstance() { - // When compiled with MSVC 7.1 in optimized mode, destroying the - // UnitTest object upon exiting the program messes up the exit code, - // causing successful tests to appear failed. We have to use a - // different implementation in this case to bypass the compiler bug. - // This implementation makes the compiler happy, at the cost of - // leaking the UnitTest object. - - // CodeGear C++Builder insists on a public destructor for the - // default implementation. Use this implementation to keep good OO - // design with private destructor. 
- -#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) - static UnitTest* const instance = new UnitTest; - return instance; -#else - static UnitTest instance; - return &instance; -#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) -} - -// Gets the number of successful test cases. -int UnitTest::successful_test_case_count() const { - return impl()->successful_test_case_count(); -} - -// Gets the number of failed test cases. -int UnitTest::failed_test_case_count() const { - return impl()->failed_test_case_count(); -} - -// Gets the number of all test cases. -int UnitTest::total_test_case_count() const { - return impl()->total_test_case_count(); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTest::test_case_to_run_count() const { - return impl()->test_case_to_run_count(); -} - -// Gets the number of successful tests. -int UnitTest::successful_test_count() const { - return impl()->successful_test_count(); -} - -// Gets the number of failed tests. -int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } - -// Gets the number of disabled tests that will be reported in the XML report. -int UnitTest::reportable_disabled_test_count() const { - return impl()->reportable_disabled_test_count(); -} - -// Gets the number of disabled tests. -int UnitTest::disabled_test_count() const { - return impl()->disabled_test_count(); -} - -// Gets the number of tests to be printed in the XML report. -int UnitTest::reportable_test_count() const { - return impl()->reportable_test_count(); -} - -// Gets the number of all tests. -int UnitTest::total_test_count() const { return impl()->total_test_count(); } - -// Gets the number of tests that should run. -int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } - -// Gets the time of the test program start, in ms from the start of the -// UNIX epoch. -internal::TimeInMillis UnitTest::start_timestamp() const { - return impl()->start_timestamp(); -} - -// Gets the elapsed time, in milliseconds. -internal::TimeInMillis UnitTest::elapsed_time() const { - return impl()->elapsed_time(); -} - -// Returns true iff the unit test passed (i.e. all test cases passed). -bool UnitTest::Passed() const { return impl()->Passed(); } - -// Returns true iff the unit test failed (i.e. some test case failed -// or something outside of all tests failed). -bool UnitTest::Failed() const { return impl()->Failed(); } - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -const TestCase* UnitTest::GetTestCase(int i) const { - return impl()->GetTestCase(i); -} - -// Returns the TestResult containing information on test failures and -// properties logged outside of individual test cases. -const TestResult& UnitTest::ad_hoc_test_result() const { - return *impl()->ad_hoc_test_result(); -} - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -TestCase* UnitTest::GetMutableTestCase(int i) { - return impl()->GetMutableTestCase(i); -} - -// Returns the list of event listeners that can be used to track events -// inside Google Test. -TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} - -// Registers and returns a global test environment. 
When a test -// program is run, all global test environments will be set-up in the -// order they were registered. After all tests in the program have -// finished, all global test environments will be torn-down in the -// *reverse* order they were registered. -// -// The UnitTest object takes ownership of the given environment. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -Environment* UnitTest::AddEnvironment(Environment* env) { - if (env == NULL) { - return NULL; - } - - impl_->environments().push_back(env); - return env; -} - -// Adds a TestPartResult to the current TestResult object. All Google Test -// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call -// this to report their results. The user code should use the -// assertion macros instead of calling this directly. -void UnitTest::AddTestPartResult( - TestPartResult::Type result_type, - const char* file_name, - int line_number, - const std::string& message, - const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { - Message msg; - msg << message; - - internal::MutexLock lock(&mutex_); - if (impl_->gtest_trace_stack().size() > 0) { - msg << "\n" << GTEST_NAME_ << " trace:"; - - for (int i = static_cast(impl_->gtest_trace_stack().size()); - i > 0; --i) { - const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; - } - } - - if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) { - msg << internal::kStackTraceMarker << os_stack_trace; - } - - const TestPartResult result = - TestPartResult(result_type, file_name, line_number, - msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); - - if (result_type != TestPartResult::kSuccess) { - // gtest_break_on_failure takes precedence over - // gtest_throw_on_failure. This allows a user to set the latter - // in the code (perhaps in order to use Google Test assertions - // with another testing framework) and specify the former on the - // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { -#if GTEST_OS_WINDOWS - // Using DebugBreak on Windows allows gtest to still break into a debugger - // when a failure happens and both the --gtest_break_on_failure and - // the --gtest_catch_exceptions flags are specified. - DebugBreak(); -#else - // Dereference NULL through a volatile pointer to prevent the compiler - // from removing. We use this rather than abort() or __builtin_trap() for - // portability: Symbian doesn't implement abort() well, and some debuggers - // don't correctly trap abort(). - *static_cast(NULL) = 1; -#endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { -#if GTEST_HAS_EXCEPTIONS - throw internal::GoogleTestFailureException(result); -#else - // We cannot call abort() as it generates a pop-up in debug mode - // that cannot be suppressed in VC 7.1 or below. - exit(1); -#endif - } - } -} - -// Adds a TestProperty to the current TestResult object when invoked from -// inside a test, to current TestCase's ad_hoc_test_result_ when invoked -// from SetUpTestCase or TearDownTestCase, or to the global property set -// when invoked elsewhere. If the result already contains a property with -// the same key, the value will be updated. 
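The property-recording path just described is reachable from user code through ::testing::Test::RecordProperty. A short sketch of the typical call follows; the test and key names are made up, and the comment describes where the value surfaces in the XML report generated by the printer above:

#include "gtest/gtest.h"

TEST(ReportDemo, RecordsAProperty) {
  // Becomes an attribute on this test's <testcase> element in the XML
  // report, e.g. <testcase name="RecordsAProperty" widget_count="12" .../>.
  // Recording the same key again would overwrite the value.
  ::testing::Test::RecordProperty("widget_count", 12);
  EXPECT_GT(12, 0);
}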
-void UnitTest::RecordProperty(const std::string& key, - const std::string& value) { - impl_->RecordProperty(TestProperty(key, value)); -} - -// Runs all tests in this UnitTest object and prints the result. -// Returns 0 if successful, or 1 otherwise. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -int UnitTest::Run() { - const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; - - // Google Test implements this protocol for catching that a test - // program exits before returning control to Google Test: - // - // 1. Upon start, Google Test creates a file whose absolute path - // is specified by the environment variable - // TEST_PREMATURE_EXIT_FILE. - // 2. When Google Test has finished its work, it deletes the file. - // - // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before - // running a Google-Test-based test program and check the existence - // of the file at the end of the test execution to see if it has - // exited prematurely. - - // If we are in the child process of a death test, don't - // create/delete the premature exit file, as doing so is unnecessary - // and will confuse the parent process. Otherwise, create/delete - // the file upon entering/leaving this function. If the program - // somehow exits before this function has a chance to return, the - // premature-exit file will be left undeleted, causing a test runner - // that understands the premature-exit-file protocol to report the - // test as having failed. - const internal::ScopedPrematureExitFile premature_exit_file( - in_death_test_child_process ? - NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); - - // Captures the value of GTEST_FLAG(catch_exceptions). This value will be - // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); - -#if GTEST_HAS_SEH - // Either the user wants Google Test to catch exceptions thrown by the - // tests or this is executing in the context of death test child - // process. In either case the user does not want to see pop-up dialogs - // about crashes - they are expected. - if (impl()->catch_exceptions() || in_death_test_child_process) { -# if !GTEST_OS_WINDOWS_MOBILE - // SetErrorMode doesn't exist on CE. - SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | - SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE - -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE - // Death test children can be terminated with _abort(). On Windows, - // _abort() can show a dialog with a warning message. This forces the - // abort message to go to stderr instead. - _set_error_mode(_OUT_TO_STDERR); -# endif - -# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE - // In the debug version, Visual Studio pops up a separate dialog - // offering a choice to debug the aborted program. We need to suppress - // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement - // executed. Google Test will notify the user of any unexpected - // failure via stderr. - // - // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. - // Users of prior VC versions shall suffer the agony and pain of - // clicking through the countless debug dialogs. - // TODO(vladl@google.com): find a way to suppress the abort dialog() in the - // debug mode when compiled with VC 7.1 or lower. 
- if (!GTEST_FLAG(break_on_failure)) - _set_abort_behavior( - 0x0, // Clear the following flags: - _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. -# endif - } -#endif // GTEST_HAS_SEH - - return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; -} - -// Returns the working directory when the first TEST() or TEST_F() was -// executed. -const char* UnitTest::original_working_dir() const { - return impl_->original_working_dir_.c_str(); -} - -// Returns the TestCase object for the test that's currently running, -// or NULL if no test is running. -const TestCase* UnitTest::current_test_case() const - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - return impl_->current_test_case(); -} - -// Returns the TestInfo object for the test that's currently running, -// or NULL if no test is running. -const TestInfo* UnitTest::current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - return impl_->current_test_info(); -} - -// Returns the random seed used at the start of the current test run. -int UnitTest::random_seed() const { return impl_->random_seed(); } - -#if GTEST_HAS_PARAM_TEST -// Returns ParameterizedTestCaseRegistry object used to keep track of -// value-parameterized tests and instantiate and register them. -internal::ParameterizedTestCaseRegistry& - UnitTest::parameterized_test_registry() - GTEST_LOCK_EXCLUDED_(mutex_) { - return impl_->parameterized_test_registry(); -} -#endif // GTEST_HAS_PARAM_TEST - -// Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} - -// Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} - -// Pushes a trace defined by SCOPED_TRACE() on to the per-thread -// Google Test trace stack. -void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().push_back(trace); -} - -// Pops a trace from the per-thread Google Test trace stack. -void UnitTest::PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().pop_back(); -} - -namespace internal { - -UnitTestImpl::UnitTestImpl(UnitTest* parent) - : parent_(parent), -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4355) // Temporarily disables warning 4355 - // (using this in initializer). - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -# pragma warning(pop) // Restores the warning state again. -#else - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -#endif // _MSC_VER - global_test_part_result_repoter_( - &default_global_test_part_result_reporter_), - per_thread_test_part_result_reporter_( - &default_per_thread_test_part_result_reporter_), -#if GTEST_HAS_PARAM_TEST - parameterized_test_registry_(), - parameterized_tests_registered_(false), -#endif // GTEST_HAS_PARAM_TEST - last_death_test_case_(-1), - current_test_case_(NULL), - current_test_info_(NULL), - ad_hoc_test_result_(), - os_stack_trace_getter_(NULL), - post_flag_parse_init_performed_(false), - random_seed_(0), // Will be overridden by the flag before first use. - random_(0), // Will be reseeded before first use. 
- start_timestamp_(0), - elapsed_time_(0), -#if GTEST_HAS_DEATH_TEST - death_test_factory_(new DefaultDeathTestFactory), -#endif - // Will be overridden by the flag before first use. - catch_exceptions_(false) { - listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); -} - -UnitTestImpl::~UnitTestImpl() { - // Deletes every TestCase. - ForEach(test_cases_, internal::Delete); - - // Deletes every Environment. - ForEach(environments_, internal::Delete); - - delete os_stack_trace_getter_; -} - -// Adds a TestProperty to the current TestResult object when invoked in a -// context of a test, to current test case's ad_hoc_test_result when invoke -// from SetUpTestCase/TearDownTestCase, or to the global property set -// otherwise. If the result already contains a property with the same key, -// the value will be updated. -void UnitTestImpl::RecordProperty(const TestProperty& test_property) { - std::string xml_element; - TestResult* test_result; // TestResult appropriate for property recording. - - if (current_test_info_ != NULL) { - xml_element = "testcase"; - test_result = &(current_test_info_->result_); - } else if (current_test_case_ != NULL) { - xml_element = "testsuite"; - test_result = &(current_test_case_->ad_hoc_test_result_); - } else { - xml_element = "testsuites"; - test_result = &ad_hoc_test_result_; - } - test_result->RecordProperty(xml_element, test_property); -} - -#if GTEST_HAS_DEATH_TEST -// Disables event forwarding if the control is currently in a death test -// subprocess. Must not be called before InitGoogleTest. -void UnitTestImpl::SuppressTestEventsIfInSubprocess() { - if (internal_run_death_test_flag_.get() != NULL) - listeners()->SuppressEventForwarding(); -} -#endif // GTEST_HAS_DEATH_TEST - -// Initializes event listeners performing XML output as specified by -// UnitTestOptions. Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureXmlOutput() { - const std::string& output_format = UnitTestOptions::GetOutputFormat(); - if (output_format == "xml") { - listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( - UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); - } else if (output_format != "") { - printf("WARNING: unrecognized output format \"%s\" ignored.\n", - output_format.c_str()); - fflush(stdout); - } -} - -#if GTEST_CAN_STREAM_RESULTS_ -// Initializes event listeners for streaming test results in string form. -// Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureStreamingOutput() { - const std::string& target = GTEST_FLAG(stream_result_to); - if (!target.empty()) { - const size_t pos = target.find(':'); - if (pos != std::string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); - } else { - printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", - target.c_str()); - fflush(stdout); - } - } -} -#endif // GTEST_CAN_STREAM_RESULTS_ - -// Performs initialization dependent upon flag values obtained in -// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to -// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest -// this function is also called from RunAllTests. Since this function can be -// called more than once, it has to be idempotent. -void UnitTestImpl::PostFlagParsingInit() { - // Ensures that this function does not execute more than once. 
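ConfigureXmlOutput() and ConfigureStreamingOutput() above only react to flag values, so they are normally exercised from the command line (--gtest_output=xml:path, --gtest_stream_result_to=host:port). The same flags can be set programmatically, as in this sketch; the report path is arbitrary, and the flag is set before InitGoogleTest() so that PostFlagParsingInit() sees it:

#include "gtest/gtest.h"

int main(int argc, char** argv) {
  // Equivalent to passing --gtest_output=xml:report.xml on the command line.
  // Set before InitGoogleTest() so ConfigureXmlOutput() installs the
  // XmlUnitTestResultPrinter during PostFlagParsingInit().
  ::testing::GTEST_FLAG(output) = "xml:report.xml";
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}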
- if (!post_flag_parse_init_performed_) { - post_flag_parse_init_performed_ = true; - -#if GTEST_HAS_DEATH_TEST - InitDeathTestSubprocessControlInfo(); - SuppressTestEventsIfInSubprocess(); -#endif // GTEST_HAS_DEATH_TEST - - // Registers parameterized tests. This makes parameterized tests - // available to the UnitTest reflection API without running - // RUN_ALL_TESTS. - RegisterParameterizedTests(); - - // Configures listeners for XML output. This makes it possible for users - // to shut down the default XML output before invoking RUN_ALL_TESTS. - ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Configures listeners for streaming test results to the specified server. - ConfigureStreamingOutput(); -#endif // GTEST_CAN_STREAM_RESULTS_ - } -} - -// A predicate that checks the name of a TestCase against a known -// value. -// -// This is used for implementation of the UnitTest class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestCaseNameIs is copyable. -class TestCaseNameIs { - public: - // Constructor. - explicit TestCaseNameIs(const std::string& name) - : name_(name) {} - - // Returns true iff the name of test_case matches name_. - bool operator()(const TestCase* test_case) const { - return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; - } - - private: - std::string name_; -}; - -// Finds and returns a TestCase with the given name. If one doesn't -// exist, creates one and returns it. It's the CALLER'S -// RESPONSIBILITY to ensure that this function is only called WHEN THE -// TESTS ARE NOT SHUFFLED. -// -// Arguments: -// -// test_case_name: name of the test case -// type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) { - // Can we find a TestCase with the given name? - const std::vector::const_iterator test_case = - std::find_if(test_cases_.begin(), test_cases_.end(), - TestCaseNameIs(test_case_name)); - - if (test_case != test_cases_.end()) - return *test_case; - - // No. Let's create one. - TestCase* const new_test_case = - new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); - - // Is this a death test case? - if (internal::UnitTestOptions::MatchesFilter(test_case_name, - kDeathTestCaseFilter)) { - // Yes. Inserts the test case after the last death test case - // defined so far. This only works when the test cases haven't - // been shuffled. Otherwise we may end up running a death test - // after a non-death test. - ++last_death_test_case_; - test_cases_.insert(test_cases_.begin() + last_death_test_case_, - new_test_case); - } else { - // No. Appends to the end of the list. - test_cases_.push_back(new_test_case); - } - - test_case_indices_.push_back(static_cast(test_case_indices_.size())); - return new_test_case; -} - -// Helpers for setting up / tearing down the given environment. They -// are for use in the ForEach() function. -static void SetUpEnvironment(Environment* env) { env->SetUp(); } -static void TearDownEnvironment(Environment* env) { env->TearDown(); } - -// Runs all tests in this UnitTest object, prints the result, and -// returns true if all tests are successful. 
If any exception is -// thrown during a test, the test is considered to be failed, but the -// rest of the tests will still be run. -// -// When parameterized tests are enabled, it expands and registers -// parameterized tests first in RegisterParameterizedTests(). -// All other functions called from RunAllTests() may safely assume that -// parameterized tests are ready to be counted and run. -bool UnitTestImpl::RunAllTests() { - // Makes sure InitGoogleTest() was called. - if (!GTestIsInitialized()) { - printf("%s", - "\nThis test program did NOT call ::testing::InitGoogleTest " - "before calling RUN_ALL_TESTS(). Please fix it.\n"); - return false; - } - - // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; - - // Repeats the call to the post-flag parsing initialization in case the - // user didn't call InitGoogleTest. - PostFlagParsingInit(); - - // Even if sharding is not on, test runners may want to use the - // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding - // protocol. - internal::WriteToShardStatusFileIfNeeded(); - - // True iff we are in a subprocess for running a thread-safe-style - // death test. - bool in_subprocess_for_death_test = false; - -#if GTEST_HAS_DEATH_TEST - in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); -#endif // GTEST_HAS_DEATH_TEST - - const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, - in_subprocess_for_death_test); - - // Compares the full test names with the filter to decide which - // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; - - // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { - // This must be called *after* FilterTests() has been called. - ListTestsMatchingFilter(); - return true; - } - - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; - - // True iff at least one test has failed. - bool failed = false; - - TestEventListener* repeater = listeners()->repeater(); - - start_timestamp_ = GetTimeInMillis(); - repeater->OnTestProgramStart(*parent_); - - // How many times to repeat the tests? We don't want to repeat them - // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); - // Repeats forever if the repeat count is negative. - const bool forever = repeat < 0; - for (int i = 0; forever || i != repeat; i++) { - // We want to preserve failures generated by ad-hoc test - // assertions executed before RUN_ALL_TESTS(). - ClearNonAdHocTestResult(); - - const TimeInMillis start = GetTimeInMillis(); - - // Shuffles test cases and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { - random()->Reseed(random_seed_); - // This should be done before calling OnTestIterationStart(), - // such that a test event listener can see the actual test order - // in the event. - ShuffleTests(); - } - - // Tells the unit test event listeners that the tests are about to start. - repeater->OnTestIterationStart(*parent_, i); - - // Runs each test case if there is at least one test to run. - if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); - - // Runs the tests only if there was no fatal failure during global - // set-up. 
- if (!Test::HasFatalFailure()) { - for (int test_index = 0; test_index < total_test_case_count(); - test_index++) { - GetMutableTestCase(test_index)->Run(); - } - } - - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); - } - - elapsed_time_ = GetTimeInMillis() - start; - - // Tells the unit test event listener that the tests have just finished. - repeater->OnTestIterationEnd(*parent_, i); - - // Gets the result and clears it. - if (!Passed()) { - failed = true; - } - - // Restores the original test order after the iteration. This - // allows the user to quickly repro a failure that happens in the - // N-th iteration without repeating the first (N - 1) iterations. - // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in - // case the user somehow changes the value of the flag somewhere - // (it's always safe to unshuffle the tests). - UnshuffleTests(); - - if (GTEST_FLAG(shuffle)) { - // Picks a new random seed for each iteration. - random_seed_ = GetNextRandomSeed(random_seed_); - } - } - - repeater->OnTestProgramEnd(*parent_); - - return !failed; -} - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded() { - const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); - if (test_shard_file != NULL) { - FILE* const file = posix::FOpen(test_shard_file, "w"); - if (file == NULL) { - ColoredPrintf(COLOR_RED, - "Could not write to the test shard status file \"%s\" " - "specified by the %s environment variable.\n", - test_shard_file, kTestShardStatusFile); - fflush(stdout); - exit(EXIT_FAILURE); - } - fclose(file); - } -} - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (i.e., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. 
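The sharding protocol described here partitions tests across processes by index once GTEST_TOTAL_SHARDS and GTEST_SHARD_INDEX are validated. As a standalone illustration (shard and test counts are made up), the selection rule implemented by ShouldRunTestOnShard() below assigns a test with id t to the shard where t % total_shards equals the shard index:

#include <cstdio>

// Mirrors the rule used by ShouldRunTestOnShard(): test t runs on shard s
// iff t % total_shards == s, so every test runs on exactly one shard.
int main() {
  const int total_shards = 3;  // e.g. GTEST_TOTAL_SHARDS=3
  for (int shard_index = 0; shard_index < total_shards; ++shard_index) {
    std::printf("shard %d runs tests:", shard_index);
    for (int test_id = 0; test_id < 10; ++test_id) {
      if (test_id % total_shards == shard_index) std::printf(" %d", test_id);
    }
    std::printf("\n");
  }
  return 0;
}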
-bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, - bool in_subprocess_for_death_test) { - if (in_subprocess_for_death_test) { - return false; - } - - const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); - const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); - - if (total_shards == -1 && shard_index == -1) { - return false; - } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (total_shards != -1 && shard_index == -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } - - return total_shards > 1; -} - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error -// and aborts. -Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { - const char* str_val = posix::GetEnv(var); - if (str_val == NULL) { - return default_val; - } - - Int32 result; - if (!ParseInt32(Message() << "The value of environment variable " << var, - str_val, &result)) { - exit(EXIT_FAILURE); - } - return result; -} - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { - return (test_id % total_shards) == shard_index; -} - -// Compares the name of each test with the user-specified filter to -// decide whether the test should be run, then records the result in -// each TestCase and TestInfo object. -// If shard_tests == true, further filters tests based on sharding -// variables in the environment - see -// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. -// Returns the number of tests that should run. -int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - - // num_runnable_tests are the number of tests that will - // run across all shards (i.e., match filter and are not disabled). - // num_selected_tests are the number of tests to be run on - // this shard. 
- int num_runnable_tests = 0; - int num_selected_tests = 0; - for (size_t i = 0; i < test_cases_.size(); i++) { - TestCase* const test_case = test_cases_[i]; - const std::string &test_case_name = test_case->name(); - test_case->set_should_run(false); - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - TestInfo* const test_info = test_case->test_info_list()[j]; - const std::string test_name(test_info->name()); - // A test is disabled if test case name or test name matches - // kDisableTestFilter. - const bool is_disabled = - internal::UnitTestOptions::MatchesFilter(test_case_name, - kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter(test_name, - kDisableTestFilter); - test_info->is_disabled_ = is_disabled; - - const bool matches_filter = - internal::UnitTestOptions::FilterMatchesTest(test_case_name, - test_name); - test_info->matches_filter_ = matches_filter; - - const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && - matches_filter; - - const bool is_selected = is_runnable && - (shard_tests == IGNORE_SHARDING_PROTOCOL || - ShouldRunTestOnShard(total_shards, shard_index, - num_runnable_tests)); - - num_runnable_tests += is_runnable; - num_selected_tests += is_selected; - - test_info->should_run_ = is_selected; - test_case->set_should_run(test_case->should_run() || is_selected); - } - } - return num_selected_tests; -} - -// Prints the given C-string on a single line by replacing all '\n' -// characters with string "\\n". If the output takes more than -// max_length characters, only prints the first max_length characters -// and "...". -static void PrintOnOneLine(const char* str, int max_length) { - if (str != NULL) { - for (int i = 0; *str != '\0'; ++str) { - if (i >= max_length) { - printf("..."); - break; - } - if (*str == '\n') { - printf("\\n"); - i += 2; - } else { - printf("%c", *str); - ++i; - } - } - } -} - -// Prints the names of the tests matching the user-specified filter flag. -void UnitTestImpl::ListTestsMatchingFilter() { - // Print at most this many characters for each type/value parameter. - const int kMaxParamLength = 250; - - for (size_t i = 0; i < test_cases_.size(); i++) { - const TestCase* const test_case = test_cases_[i]; - bool printed_test_case_name = false; - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - const TestInfo* const test_info = - test_case->test_info_list()[j]; - if (test_info->matches_filter_) { - if (!printed_test_case_name) { - printed_test_case_name = true; - printf("%s.", test_case->name()); - if (test_case->type_param() != NULL) { - printf(" # %s = ", kTypeParamLabel); - // We print the type parameter on a single line to make - // the output easy to parse by a program. - PrintOnOneLine(test_case->type_param(), kMaxParamLength); - } - printf("\n"); - } - printf(" %s", test_info->name()); - if (test_info->value_param() != NULL) { - printf(" # %s = ", kValueParamLabel); - // We print the value parameter on a single line to make the - // output easy to parse by a program. - PrintOnOneLine(test_info->value_param(), kMaxParamLength); - } - printf("\n"); - } - } - } - fflush(stdout); -} - -// Sets the OS stack trace getter. -// -// Does nothing if the input and the current OS stack trace getter are -// the same; otherwise, deletes the old getter and makes the input the -// current getter. 
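Taken together, FilterTests() and ShouldRunTestOnShard() select a test for this shard only if it matches the filter, is runnable, and its running index lands on the shard. A minimal standalone sketch of that selection rule, assuming GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1 (the helper name below is illustrative, not part of Google Test):

    #include <cstdio>

    // Stand-in for internal::ShouldRunTestOnShard(): a runnable test with
    // sequential id `test_id` is selected when its id falls on this shard.
    static bool RunsOnShard(int total_shards, int shard_index, int test_id) {
      return (test_id % total_shards) == shard_index;
    }

    int main() {
      for (int id = 0; id < 9; ++id) {
        if (RunsOnShard(3, 1, id))
          std::printf("runnable test #%d is selected on shard 1 of 3\n", id);
      }
      return 0;  // Selects ids 1, 4 and 7.
    }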
-void UnitTestImpl::set_os_stack_trace_getter( - OsStackTraceGetterInterface* getter) { - if (os_stack_trace_getter_ != getter) { - delete os_stack_trace_getter_; - os_stack_trace_getter_ = getter; - } -} - -// Returns the current OS stack trace getter if it is not NULL; -// otherwise, creates an OsStackTraceGetter, makes it the current -// getter, and returns it. -OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { - if (os_stack_trace_getter_ == NULL) { - os_stack_trace_getter_ = new OsStackTraceGetter; - } - - return os_stack_trace_getter_; -} - -// Returns the TestResult for the test that's currently running, or -// the TestResult for the ad hoc test if no test is running. -TestResult* UnitTestImpl::current_test_result() { - return current_test_info_ ? - &(current_test_info_->result_) : &ad_hoc_test_result_; -} - -// Shuffles all test cases, and the tests within each test case, -// making sure that death tests are still run first. -void UnitTestImpl::ShuffleTests() { - // Shuffles the death test cases. - ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); - - // Shuffles the non-death test cases. - ShuffleRange(random(), last_death_test_case_ + 1, - static_cast(test_cases_.size()), &test_case_indices_); - - // Shuffles the tests inside each test case. - for (size_t i = 0; i < test_cases_.size(); i++) { - test_cases_[i]->ShuffleTests(random()); - } -} - -// Restores the test cases and tests to their order before the first shuffle. -void UnitTestImpl::UnshuffleTests() { - for (size_t i = 0; i < test_cases_.size(); i++) { - // Unshuffles the tests in each test case. - test_cases_[i]->UnshuffleTests(); - // Resets the index of each test case. - test_case_indices_[i] = static_cast(i); - } -} - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { - // We pass skip_count + 1 to skip this wrapper function in addition - // to what the user really wants to skip. - return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); -} - -// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to -// suppress unreachable code warnings. -namespace { -class ClassUniqueToAlwaysTrue {}; -} - -bool IsTrue(bool condition) { return condition; } - -bool AlwaysTrue() { -#if GTEST_HAS_EXCEPTIONS - // This condition is always false so AlwaysTrue() never actually throws, - // but it makes the compiler think that it may throw. - if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); -#endif // GTEST_HAS_EXCEPTIONS - return true; -} - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -bool SkipPrefix(const char* prefix, const char** pstr) { - const size_t prefix_len = strlen(prefix); - if (strncmp(*pstr, prefix, prefix_len) == 0) { - *pstr += prefix_len; - return true; - } - return false; -} - -// Parses a string as a command line flag. 
The string should have -// the format "--flag=value". When def_optional is true, the "=value" -// part can be omitted. -// -// Returns the value of the flag, or NULL if the parsing failed. -const char* ParseFlagValue(const char* str, - const char* flag, - bool def_optional) { - // str and flag must not be NULL. - if (str == NULL || flag == NULL) return NULL; - - // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. - const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; - const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; - - // Skips the flag name. - const char* flag_end = str + flag_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) { - return flag_end; - } - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return NULL; - - // Returns the string after "=". - return flag_end + 1; -} - -// Parses a string for a bool flag, in the form of either -// "--flag=value" or "--flag". -// -// In the former case, the value is taken as true as long as it does -// not start with '0', 'f', or 'F'. -// -// In the latter case, the value is taken as true. -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Converts the string value to a bool. - *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); - return true; -} - -// Parses a string for an Int32 flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); -} - -// Parses a string for a string flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseStringFlag(const char* str, const char* flag, std::string* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - *value = value_str; - return true; -} - -// Determines whether a string has a prefix that Google Test uses for its -// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. -// If Google Test detects that a command line flag has its prefix but is not -// recognized, it will print its help message. Flags starting with -// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test -// internal flags and do not trigger the help message. 
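As a concrete illustration of the "--flag[=value]" convention these parsers implement, here is a simplified, self-contained restatement; the helper below hard-codes the "--gtest_" prefix and is not the real ParseFlagValue():

    #include <cstring>
    #include <string>

    // Returns a pointer to the value part of "--gtest_<name>=value", the empty
    // string for a bare "--gtest_<name>" when the value is optional, or NULL
    // when arg is not that flag at all.
    static const char* FlagValue(const char* arg, const char* name,
                                 bool value_optional) {
      const std::string full = std::string("--gtest_") + name;
      if (std::strncmp(arg, full.c_str(), full.length()) != 0) return NULL;
      const char* rest = arg + full.length();
      if (value_optional && *rest == '\0') return rest;  // e.g. "--gtest_shuffle"
      return (*rest == '=') ? rest + 1 : NULL;           // e.g. "--gtest_repeat=5"
    }

    // For a bool flag the parsed value is true unless it starts with '0', 'f'
    // or 'F': "--gtest_shuffle" and "--gtest_shuffle=yes" are true, while
    // "--gtest_shuffle=0" and "--gtest_shuffle=false" are false.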
-static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || - SkipPrefix("/", &str)) && - !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && - (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || - SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); -} - -// Prints a string containing code-encoded text. The following escape -// sequences can be used in the string to control the text color: -// -// @@ prints a single '@' character. -// @R changes the color to red. -// @G changes the color to green. -// @Y changes the color to yellow. -// @D changes to the default terminal text color. -// -// TODO(wan@google.com): Write tests for this once we add stdout -// capturing to Google Test. -static void PrintColorEncoded(const char* str) { - GTestColor color = COLOR_DEFAULT; // The current color. - - // Conceptually, we split the string into segments divided by escape - // sequences. Then we print one segment at a time. At the end of - // each iteration, the str pointer advances to the beginning of the - // next segment. - for (;;) { - const char* p = strchr(str, '@'); - if (p == NULL) { - ColoredPrintf(color, "%s", str); - return; - } - - ColoredPrintf(color, "%s", std::string(str, p).c_str()); - - const char ch = p[1]; - str = p + 2; - if (ch == '@') { - ColoredPrintf(color, "@"); - } else if (ch == 'D') { - color = COLOR_DEFAULT; - } else if (ch == 'R') { - color = COLOR_RED; - } else if (ch == 'G') { - color = COLOR_GREEN; - } else if (ch == 'Y') { - color = COLOR_YELLOW; - } else { - --str; - } - } -} - -static const char kColorEncodedHelpMessage[] = -"This program contains tests written using " GTEST_NAME_ ". You can use the\n" -"following command line flags to control its behavior:\n" -"\n" -"Test Selection:\n" -" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" -" List the names of all tests instead of running them. The name of\n" -" TEST(Foo, Bar) is \"Foo.Bar\".\n" -" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" - "[@G-@YNEGATIVE_PATTERNS]@D\n" -" Run only the tests whose name matches one of the positive patterns but\n" -" none of the negative patterns. '?' matches any single character; '*'\n" -" matches any substring; ':' separates two patterns.\n" -" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" -" Run all disabled tests too.\n" -"\n" -"Test Execution:\n" -" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" -" Run the tests repeatedly; use a negative count to repeat forever.\n" -" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" -" Randomize tests' orders on every iteration.\n" -" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" -" Random number seed to use for shuffling test orders (between 1 and\n" -" 99999, or 0 to use a seed based on the current time).\n" -"\n" -"Test Output:\n" -" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" -" Enable/disable colored output. The default is @Gauto@D.\n" -" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" -" Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" - GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate an XML report in the given directory or with the given file\n" -" name. 
@YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" -#if GTEST_CAN_STREAM_RESULTS_ -" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" -" Stream test results to the given server.\n" -#endif // GTEST_CAN_STREAM_RESULTS_ -"\n" -"Assertion Behavior:\n" -#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" -" Set the default death test style.\n" -#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" -" Turn assertion failures into debugger break-points.\n" -" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions.\n" -" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" -" Do not report exceptions as test failures. Instead, allow them\n" -" to crash the program or throw a pop-up (on Windows).\n" -"\n" -"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " - "the corresponding\n" -"environment variable of a flag (all letters in upper-case). For example, to\n" -"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ - "color=no@D or set\n" -"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" -"\n" -"For more information, please read the " GTEST_NAME_ " documentation at\n" -"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" -"(not one in your own code or tests), please report it to\n" -"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. The type parameter CharType can be -// instantiated to either char or wchar_t. -template <typename CharType> -void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { - for (int i = 1; i < *argc; i++) { - const std::string arg_string = StreamableToString(argv[i]); - const char* const arg = arg_string.c_str(); - - using internal::ParseBoolFlag; - using internal::ParseInt32Flag; - using internal::ParseStringFlag; - - // Do we see a Google Test flag? - if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, - &GTEST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - &GTEST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - &GTEST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - &GTEST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - &GTEST_FLAG(death_test_use_fork)) || - ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - &GTEST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) || - ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) || - ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - &GTEST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - &GTEST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, - &GTEST_FLAG(throw_on_failure)) - ) { - // Yes. Shift the remainder of the argv list left by one. Note - // that argv has (*argc + 1) elements, the last one always being - // NULL. The following loop moves the trailing NULL element as - // well.
- for (int j = i; j != *argc; j++) { - argv[j] = argv[j + 1]; - } - - // Decrements the argument count. - (*argc)--; - - // We also need to decrement the iterator as we just removed - // an element. - i--; - } else if (arg_string == "--help" || arg_string == "-h" || - arg_string == "-?" || arg_string == "/?" || - HasGoogleTestFlagPrefix(arg)) { - // Both help flag and unrecognized Google Test flags (excluding - // internal ones) trigger help display. - g_help_flag = true; - } - } - - if (g_help_flag) { - // We print the help here instead of in RUN_ALL_TESTS(), as the - // latter may not be called at all if the user is using Google - // Test with another testing framework. - PrintColorEncoded(kColorEncodedHelpMessage); - } -} - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. -void ParseGoogleTestFlagsOnly(int* argc, char** argv) { - ParseGoogleTestFlagsOnlyImpl(argc, argv); -} -void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { - ParseGoogleTestFlagsOnlyImpl(argc, argv); -} - -// The internal implementation of InitGoogleTest(). -// -// The type parameter CharType can be instantiated to either char or -// wchar_t. -template -void InitGoogleTestImpl(int* argc, CharType** argv) { - g_init_gtest_count++; - - // We don't want to run the initialization code twice. - if (g_init_gtest_count != 1) return; - - if (*argc <= 0) return; - - internal::g_executable_path = internal::StreamableToString(argv[0]); - -#if GTEST_HAS_DEATH_TEST - - g_argvs.clear(); - for (int i = 0; i != *argc; i++) { - g_argvs.push_back(StreamableToString(argv[i])); - } - -#endif // GTEST_HAS_DEATH_TEST - - ParseGoogleTestFlagsOnly(argc, argv); - GetUnitTestImpl()->PostFlagParsingInit(); -} - -} // namespace internal - -// Initializes Google Test. This must be called before calling -// RUN_ALL_TESTS(). In particular, it parses a command line for the -// flags that Google Test recognizes. Whenever a Google Test flag is -// seen, it is removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Test flag variables are -// updated. -// -// Calling the function for the second time has no user-visible effect. -void InitGoogleTest(int* argc, char** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -void InitGoogleTest(int* argc, wchar_t** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -} // namespace testing -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) -// -// This file implements death tests. - - -#if GTEST_HAS_DEATH_TEST - -# if GTEST_OS_MAC -# include -# endif // GTEST_OS_MAC - -# include -# include -# include - -# if GTEST_OS_LINUX -# include -# endif // GTEST_OS_LINUX - -# include - -# if GTEST_OS_WINDOWS -# include -# else -# include -# include -# endif // GTEST_OS_WINDOWS - -# if GTEST_OS_QNX -# include -# endif // GTEST_OS_QNX - -#endif // GTEST_HAS_DEATH_TEST - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -// Constants. - -// The default death test style. -static const char kDefaultDeathTestStyle[] = "fast"; - -GTEST_DEFINE_string_( - death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), - "Indicates how to run a death test in a forked child process: " - "\"threadsafe\" (child process re-executes the test binary " - "from the beginning, running only the specific death test) or " - "\"fast\" (child process runs the death test immediately " - "after forking)."); - -GTEST_DEFINE_bool_( - death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), - "Instructs to use fork()/_exit() instead of clone() in death tests. " - "Ignored and always uses fork() on POSIX systems where clone() is not " - "implemented. Useful when running under valgrind or similar tools if " - "those do not support clone(). Valgrind 3.3.1 will just fail if " - "it sees an unsupported combination of clone() flags. " - "It is not recommended to use this flag w/o valgrind though it will " - "work in 99% of the cases. Once valgrind is fixed, this flag will " - "most likely be removed."); - -namespace internal { -GTEST_DEFINE_string_( - internal_run_death_test, "", - "Indicates the file, line number, temporal index of " - "the single death test to run, and a file descriptor to " - "which a success code may be sent, all separated by " - "the '|' characters. This flag is specified if and only if the current " - "process is a sub-process launched for running a thread-safe " - "death test. FOR INTERNAL USE ONLY."); -} // namespace internal - -#if GTEST_HAS_DEATH_TEST - -namespace internal { - -# if !GTEST_OS_WINDOWS -// Valid only for fast death tests. Indicates the code is running in the -// child process of a fast style death test. -static bool g_in_fast_death_test_child = false; -# endif // !GTEST_OS_WINDOWS - -// Returns a Boolean value indicating whether the caller is currently -// executing in the context of the death test child process. Tools such as -// Valgrind heap checkers may need this to modify their behavior in death -// tests. IMPORTANT: This is an internal utility. 
Using it may break the -// implementation of death tests. User code MUST NOT use it. -bool InDeathTestChild() { -# if GTEST_OS_WINDOWS - - // On Windows, death tests are thread-safe regardless of the value of the - // death_test_style flag. - return !GTEST_FLAG(internal_run_death_test).empty(); - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") - return !GTEST_FLAG(internal_run_death_test).empty(); - else - return g_in_fast_death_test_child; -#endif -} - -} // namespace internal - -// ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} - -// ExitedWithCode function-call operator. -bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS - - return exit_status == exit_code_; - -# else - - return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; - -# endif // GTEST_OS_WINDOWS -} - -# if !GTEST_OS_WINDOWS -// KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} - -// KilledBySignal function-call operator. -bool KilledBySignal::operator()(int exit_status) const { - return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; -} -# endif // !GTEST_OS_WINDOWS - -namespace internal { - -// Utilities needed for death tests. - -// Generates a textual description of a given exit code, in the format -// specified by wait(2). -static std::string ExitSummary(int exit_code) { - Message m; - -# if GTEST_OS_WINDOWS - - m << "Exited with exit status " << exit_code; - -# else - - if (WIFEXITED(exit_code)) { - m << "Exited with exit status " << WEXITSTATUS(exit_code); - } else if (WIFSIGNALED(exit_code)) { - m << "Terminated by signal " << WTERMSIG(exit_code); - } -# ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - m << " (core dumped)"; - } -# endif -# endif // GTEST_OS_WINDOWS - - return m.GetString(); -} - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -bool ExitedUnsuccessfully(int exit_status) { - return !ExitedWithCode(0)(exit_status); -} - -# if !GTEST_OS_WINDOWS -// Generates a textual failure message when a death test finds more than -// one thread running, or cannot determine the number of threads, prior -// to executing the given statement. It is the responsibility of the -// caller not to pass a thread_count of 1. -static std::string DeathTestThreadWarning(size_t thread_count) { - Message msg; - msg << "Death tests use fork(), which is unsafe particularly" - << " in a threaded context. For this test, " << GTEST_NAME_ << " "; - if (thread_count == 0) - msg << "couldn't detect the number of threads."; - else - msg << "detected " << thread_count << " threads."; - return msg.GetString(); -} -# endif // !GTEST_OS_WINDOWS - -// Flag characters for reporting a death test that did not die. -static const char kDeathTestLived = 'L'; -static const char kDeathTestReturned = 'R'; -static const char kDeathTestThrew = 'T'; -static const char kDeathTestInternalError = 'I'; - -// An enumeration describing all of the possible ways that a death test can -// conclude. DIED means that the process died while executing the test -// code; LIVED means that process lived beyond the end of the test code; -// RETURNED means that the test statement attempted to execute a return -// statement, which is not allowed; THREW means that the test statement -// returned control by throwing an exception. IN_PROGRESS means the test -// has not yet concluded. 
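For reference, the ExitedWithCode and KilledBySignal predicates defined above are normally used through Google Test's public EXPECT_EXIT/ASSERT_EXIT macros. A minimal usage sketch (the test and helper names are illustrative):

    #include <csignal>
    #include <cstdlib>
    #include "gtest/gtest.h"

    static void QuitWithCode2() { std::exit(2); }

    TEST(MyDeathTest, ExitStatusIsChecked) {
      // Passes if the statement exits with status 2 and stderr matches "".
      EXPECT_EXIT(QuitWithCode2(), ::testing::ExitedWithCode(2), "");
    #if !GTEST_OS_WINDOWS
      // KilledBySignal() is only provided on POSIX systems (see above).
      EXPECT_EXIT(std::abort(), ::testing::KilledBySignal(SIGABRT), "");
    #endif
    }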
-// TODO(vladl@google.com): Unify names and possibly values for -// AbortReason, DeathTestOutcome, and flag characters above. -enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; - -// Routine for aborting the program which is safe to call from an -// exec-style death test child process, in which case the error -// message is propagated back to the parent process. Otherwise, the -// message is simply printed to stderr. In either case, the program -// then exits with status 1. -void DeathTestAbort(const std::string& message) { - // On a POSIX system, this function may be called from a threadsafe-style - // death test child process, which operates on a very small stack. Use - // the heap for any additional non-minuscule memory requirements. - const InternalRunDeathTestFlag* const flag = - GetUnitTestImpl()->internal_run_death_test_flag(); - if (flag != NULL) { - FILE* parent = posix::FDOpen(flag->write_fd(), "w"); - fputc(kDeathTestInternalError, parent); - fprintf(parent, "%s", message.c_str()); - fflush(parent); - _exit(1); - } else { - fprintf(stderr, "%s", message.c_str()); - fflush(stderr); - posix::Abort(); - } -} - -// A replacement for CHECK that calls DeathTestAbort if the assertion -// fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for -// evaluating any system call that fulfills two conditions: it must return -// -1 on failure, and set errno to EINTR when it is interrupted and -// should be tried again. The macro expands to a loop that repeatedly -// evaluates the expression as long as it evaluates to -1 and sets -// errno to EINTR. If the expression evaluates to -1 but errno is -// something other than EINTR, DeathTestAbort is called. -# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression + " != -1"); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// Returns the message describing the last system error in errno. -std::string GetLastErrnoDescription() { - return errno == 0 ? "" : posix::StrError(errno); -} - -// This is called from a death test parent process to read a failure -// message from the death test child process and log it with the FATAL -// severity. On Windows, the message is read from a pipe handle. On other -// platforms, it is read from a file descriptor. -static void FailFromInternalError(int fd) { - Message error; - char buffer[256]; - int num_read; - - do { - while ((num_read = posix::Read(fd, buffer, 255)) > 0) { - buffer[num_read] = '\0'; - error << buffer; - } - } while (num_read == -1 && errno == EINTR); - - if (num_read == 0) { - GTEST_LOG_(FATAL) << error.GetString(); - } else { - const int last_error = errno; - GTEST_LOG_(FATAL) << "Error while reading death test internal: " - << GetLastErrnoDescription() << " [" << last_error << "]"; - } -} - -// Death test constructor. Increments the running death test count -// for the current test. 
-DeathTest::DeathTest() { - TestInfo* const info = GetUnitTestImpl()->current_test_info(); - if (info == NULL) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); - } -} - -// Creates and returns a death test by dispatching to the current -// death test factory. -bool DeathTest::Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) { - return GetUnitTestImpl()->death_test_factory()->Create( - statement, regex, file, line, test); -} - -const char* DeathTest::LastMessage() { - return last_death_test_message_.c_str(); -} - -void DeathTest::set_last_death_test_message(const std::string& message) { - last_death_test_message_ = message; -} - -std::string DeathTest::last_death_test_message_; - -// Provides cross platform implementation for some death functionality. -class DeathTestImpl : public DeathTest { - protected: - DeathTestImpl(const char* a_statement, const RE* a_regex) - : statement_(a_statement), - regex_(a_regex), - spawned_(false), - status_(-1), - outcome_(IN_PROGRESS), - read_fd_(-1), - write_fd_(-1) {} - - // read_fd_ is expected to be closed and cleared by a derived class. - ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } - - void Abort(AbortReason reason); - virtual bool Passed(bool status_ok); - - const char* statement() const { return statement_; } - const RE* regex() const { return regex_; } - bool spawned() const { return spawned_; } - void set_spawned(bool is_spawned) { spawned_ = is_spawned; } - int status() const { return status_; } - void set_status(int a_status) { status_ = a_status; } - DeathTestOutcome outcome() const { return outcome_; } - void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } - int read_fd() const { return read_fd_; } - void set_read_fd(int fd) { read_fd_ = fd; } - int write_fd() const { return write_fd_; } - void set_write_fd(int fd) { write_fd_ = fd; } - - // Called in the parent process only. Reads the result code of the death - // test child process via a pipe, interprets it to set the outcome_ - // member, and closes read_fd_. Outputs diagnostics and terminates in - // case of unexpected codes. - void ReadAndInterpretStatusByte(); - - private: - // The textual content of the code this object is testing. This class - // doesn't own this string and should not attempt to delete it. - const char* const statement_; - // The regular expression which test output must match. DeathTestImpl - // doesn't own this object and should not attempt to delete it. - const RE* const regex_; - // True if the death test child process has been successfully spawned. - bool spawned_; - // The exit status of the child process. - int status_; - // How the death test concluded. - DeathTestOutcome outcome_; - // Descriptor to the read end of the pipe to the child process. It is - // always -1 in the child process. The child keeps its write end of the - // pipe in write_fd_. - int read_fd_; - // Descriptor to the child's write end of the pipe to the parent process. - // It is always -1 in the parent process. The parent keeps its end of the - // pipe in read_fd_. - int write_fd_; -}; - -// Called in the parent process only. Reads the result code of the death -// test child process via a pipe, interprets it to set the outcome_ -// member, and closes read_fd_. Outputs diagnostics and terminates in -// case of unexpected codes. 
-void DeathTestImpl::ReadAndInterpretStatusByte() { - char flag; - int bytes_read; - - // The read() here blocks until data is available (signifying the - // failure of the death test) or until the pipe is closed (signifying - // its success), so it's okay to call this in the parent before - // the child process has exited. - do { - bytes_read = posix::Read(read_fd(), &flag, 1); - } while (bytes_read == -1 && errno == EINTR); - - if (bytes_read == 0) { - set_outcome(DIED); - } else if (bytes_read == 1) { - switch (flag) { - case kDeathTestReturned: - set_outcome(RETURNED); - break; - case kDeathTestThrew: - set_outcome(THREW); - break; - case kDeathTestLived: - set_outcome(LIVED); - break; - case kDeathTestInternalError: - FailFromInternalError(read_fd()); // Does not return. - break; - default: - GTEST_LOG_(FATAL) << "Death test child process reported " - << "unexpected status byte (" - << static_cast(flag) << ")"; - } - } else { - GTEST_LOG_(FATAL) << "Read from death test child process failed: " - << GetLastErrnoDescription(); - } - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); - set_read_fd(-1); -} - -// Signals that the death test code which should have exited, didn't. -// Should be called only in a death test child process. -// Writes a status byte to the child's status file descriptor, then -// calls _exit(1). -void DeathTestImpl::Abort(AbortReason reason) { - // The parent process considers the death test to be a failure if - // it finds any data in our pipe. So, here we write a single flag byte - // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; - - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); - // We are leaking the descriptor here because on some platforms (i.e., - // when built as Windows DLL), destructors of global objects will still - // run after calling _exit(). On such systems, write_fd_ will be - // indirectly closed from the destructor of UnitTestImpl, causing double - // close if it is also closed here. On debug configurations, double close - // may assert. As there are no in-process buffers to flush here, we are - // relying on the OS to close the descriptor after the process terminates - // when the destructors are not run. - _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) -} - -// Returns an indented copy of stderr output for a death test. -// This makes distinguishing death test output lines from regular log lines -// much easier. -static ::std::string FormatDeathTestOutput(const ::std::string& output) { - ::std::string ret; - for (size_t at = 0; ; ) { - const size_t line_end = output.find('\n', at); - ret += "[ DEATH ] "; - if (line_end == ::std::string::npos) { - ret += output.substr(at); - break; - } - ret += output.substr(at, line_end + 1 - at); - at = line_end + 1; - } - return ret; -} - -// Assesses the success or failure of a death test, using both private -// members which have previously been set, and one argument: -// -// Private data members: -// outcome: An enumeration describing how the death test -// concluded: DIED, LIVED, THREW, or RETURNED. The death test -// fails in the latter three cases. -// status: The exit status of the child process. On *nix, it is in the -// in the format specified by wait(2). On Windows, this is the -// value supplied to the ExitProcess() API or a numeric code -// of the exception that terminated the program. 
-// regex: A regular expression object to be applied to -// the test's captured standard error output; the death test -// fails if it does not match. -// -// Argument: -// status_ok: true if exit_status is acceptable in the context of -// this particular death test, which fails if it is false -// -// Returns true iff all of the above conditions are met. Otherwise, the -// first failing condition, in the order given above, is the one that is -// reported. Also sets the last death test message string. -bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; - - const std::string error_message = GetCapturedStderr(); - - bool success = false; - Message buffer; - - buffer << "Death test: " << statement() << "\n"; - switch (outcome()) { - case LIVED: - buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case THREW: - buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case RETURNED: - buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case DIED: - if (status_ok) { - const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); - if (matched) { - success = true; - } else { - buffer << " Result: died but not with expected error.\n" - << " Expected: " << regex()->pattern() << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - } else { - buffer << " Result: died but not with expected exit code:\n" - << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - break; - case IN_PROGRESS: - default: - GTEST_LOG_(FATAL) - << "DeathTest::Passed somehow called before conclusion of test"; - } - - DeathTest::set_last_death_test_message(buffer.GetString()); - return success; -} - -# if GTEST_OS_WINDOWS -// WindowsDeathTest implements death tests on Windows. Due to the -// specifics of starting new processes on Windows, death tests there are -// always threadsafe, and Google Test considers the -// --gtest_death_test_style=fast setting to be equivalent to -// --gtest_death_test_style=threadsafe there. -// -// A few implementation notes: Like the Linux version, the Windows -// implementation uses pipes for child-to-parent communication. But due to -// the specifics of pipes on Windows, some extra steps are required: -// -// 1. The parent creates a communication pipe and stores handles to both -// ends of it. -// 2. The parent starts the child and provides it with the information -// necessary to acquire the handle to the write end of the pipe. -// 3. The child acquires the write end of the pipe and signals the parent -// using a Windows event. -// 4. Now the parent can release the write end of the pipe on its side. If -// this is done before step 3, the object's reference count goes down to -// 0 and it is destroyed, preventing the child from acquiring it. The -// parent now has to release it, or read operations on the read end of -// the pipe will not return when the child terminates. -// 5. The parent reads child's output through the pipe (outcome code and -// any possible error messages) from the pipe, and its stderr and then -// determines whether to fail the test. -// -// Note: to distinguish Win32 API calls from the local method and function -// calls, the former are explicitly resolved in the global namespace. 
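From the user's side, the distinction described above is controlled by --gtest_death_test_style on the command line or by the corresponding flag variable in code; a minimal sketch of the latter:

    #include "gtest/gtest.h"

    int main(int argc, char** argv) {
      ::testing::InitGoogleTest(&argc, argv);
      // "threadsafe" re-executes the test binary for each death test;
      // "fast" (the default) forks and runs the statement immediately.
      // On Windows both settings behave like "threadsafe", as noted above.
      ::testing::FLAGS_gtest_death_test_style = "threadsafe";
      return RUN_ALL_TESTS();
    }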
-// -class WindowsDeathTest : public DeathTestImpl { - public: - WindowsDeathTest(const char* a_statement, - const RE* a_regex, - const char* file, - int line) - : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - virtual TestRole AssumeRole(); - - private: - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; - // Handle to the write end of the pipe to the child process. - AutoHandle write_handle_; - // Child process handle. - AutoHandle child_handle_; - // Event the child process uses to signal the parent that it has - // acquired the handle to the write end of the pipe. After seeing this - // event the parent can release its own handles to make sure its - // ReadFile() calls return when the child terminates. - AutoHandle event_handle_; -}; - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. -int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; - - // Wait until the child either signals that it has acquired the write end - // of the pipe or it dies. - const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, - FALSE, // Waits for any of the handles. - INFINITE)) { - case WAIT_OBJECT_0: - case WAIT_OBJECT_0 + 1: - break; - default: - GTEST_DEATH_TEST_CHECK_(false); // Should not get here. - } - - // The child has acquired the write end of the pipe or exited. - // We release the handle on our side and continue. - write_handle_.Reset(); - event_handle_.Reset(); - - ReadAndInterpretStatusByte(); - - // Waits for the child process to exit if it haven't already. This - // returns immediately if the child has already exited, regardless of - // whether previous calls to WaitForMultipleObjects synchronized on this - // handle or not. - GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); - DWORD status_code; - GTEST_DEATH_TEST_CHECK_( - ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); - child_handle_.Reset(); - set_status(static_cast(status_code)); - return status(); -} - -// The AssumeRole process for a Windows death test. It creates a child -// process with the same executable as the current process to run the -// death test. The child process is given the --gtest_filter and -// --gtest_internal_run_death_test flags such that it knows to run the -// current death test only. -DeathTest::TestRole WindowsDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - // ParseInternalRunDeathTestFlag() has performed all the necessary - // processing. - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - // WindowsDeathTest uses an anonymous pipe to communicate results of - // a death test. - SECURITY_ATTRIBUTES handles_are_inheritable = { - sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; - HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. 
- != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), - O_RDONLY)); - write_handle_.Reset(write_handle); - event_handle_.Reset(::CreateEvent( - &handles_are_inheritable, - TRUE, // The event will automatically reset to non-signaled state. - FALSE, // The initial state is non-signalled. - NULL)); // The even is unnamed. - GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + - info->test_case_name() + "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + - "=" + file_ + "|" + StreamableToString(line_) + "|" + - StreamableToString(death_test_index) + "|" + - StreamableToString(static_cast(::GetCurrentProcessId())) + - // size_t has the same width as pointers on both 32-bit and 64-bit - // Windows platforms. - // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - "|" + StreamableToString(reinterpret_cast(write_handle)) + - "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); - - char executable_path[_MAX_PATH + 1]; // NOLINT - GTEST_DEATH_TEST_CHECK_( - _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, - executable_path, - _MAX_PATH)); - - std::string command_line = - std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + - internal_flag + "\""; - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // Flush the log buffers since the log streams are shared with the child. - FlushInfoLog(); - - // The child process will share the standard handles with the parent. - STARTUPINFOA startup_info; - memset(&startup_info, 0, sizeof(STARTUPINFO)); - startup_info.dwFlags = STARTF_USESTDHANDLES; - startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); - startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); - startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); - - PROCESS_INFORMATION process_info; - GTEST_DEATH_TEST_CHECK_(::CreateProcessA( - executable_path, - const_cast(command_line.c_str()), - NULL, // Retuned process handle is not inheritable. - NULL, // Retuned thread handle is not inheritable. - TRUE, // Child inherits all inheritable handles (for write_handle_). - 0x0, // Default creation flags. - NULL, // Inherit the parent's environment. - UnitTest::GetInstance()->original_working_dir(), - &startup_info, - &process_info) != FALSE); - child_handle_.Reset(process_info.hProcess); - ::CloseHandle(process_info.hThread); - set_spawned(true); - return OVERSEE_TEST; -} -# else // We are not on Windows. - -// ForkingDeathTest provides implementations for most of the abstract -// methods of the DeathTest interface. Only the AssumeRole method is -// left undefined. -class ForkingDeathTest : public DeathTestImpl { - public: - ForkingDeathTest(const char* statement, const RE* regex); - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - - protected: - void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } - - private: - // PID of child process during death test; 0 in the child process itself. - pid_t child_pid_; -}; - -// Constructs a ForkingDeathTest. -ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) - : DeathTestImpl(a_statement, a_regex), - child_pid_(-1) {} - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. 
-int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; - - ReadAndInterpretStatusByte(); - - int status_value; - GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); - set_status(status_value); - return status_value; -} - -// A concrete death test class that forks, then immediately runs the test -// in the child process. -class NoExecDeathTest : public ForkingDeathTest { - public: - NoExecDeathTest(const char* a_statement, const RE* a_regex) : - ForkingDeathTest(a_statement, a_regex) { } - virtual TestRole AssumeRole(); -}; - -// The AssumeRole process for a fork-and-run death test. It implements a -// straightforward fork, with a simple pipe to transmit the status byte. -DeathTest::TestRole NoExecDeathTest::AssumeRole() { - const size_t thread_count = GetThreadCount(); - if (thread_count != 1) { - GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - - DeathTest::set_last_death_test_message(""); - CaptureStderr(); - // When we fork the process below, the log file buffers are copied, but the - // file descriptors are shared. We flush all log files here so that closing - // the file descriptors in the child process doesn't throw off the - // synchronization between descriptors and buffers in the parent process. - // This is as close to the fork as possible to avoid a race condition in case - // there are multiple threads running before the death test, and another - // thread writes to the log file. - FlushInfoLog(); - - const pid_t child_pid = fork(); - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - set_child_pid(child_pid); - if (child_pid == 0) { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); - set_write_fd(pipe_fd[1]); - // Redirects all logging to stderr in the child process to prevent - // concurrent writes to the log files. We capture stderr in the parent - // process and append the child process' output to a log. - LogToStderr(); - // Event forwarding to the listeners of event listener API mush be shut - // down in death test subprocesses. - GetUnitTestImpl()->listeners()->SuppressEventForwarding(); - g_in_fast_death_test_child = true; - return EXECUTE_TEST; - } else { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; - } -} - -// A concrete death test class that forks and re-executes the main -// program from the beginning, with command-line flags set that cause -// only this specific death test to be run. -class ExecDeathTest : public ForkingDeathTest { - public: - ExecDeathTest(const char* a_statement, const RE* a_regex, - const char* file, int line) : - ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } - virtual TestRole AssumeRole(); - private: - static ::std::vector - GetArgvsForDeathTestChildProcess() { - ::std::vector args = GetInjectableArgvs(); - return args; - } - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; -}; - -// Utility class for accumulating command-line arguments. 
-class Arguments { - public: - Arguments() { - args_.push_back(NULL); - } - - ~Arguments() { - for (std::vector::iterator i = args_.begin(); i != args_.end(); - ++i) { - free(*i); - } - } - void AddArgument(const char* argument) { - args_.insert(args_.end() - 1, posix::StrDup(argument)); - } - - template - void AddArguments(const ::std::vector& arguments) { - for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { - args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); - } - } - char* const* Argv() { - return &args_[0]; - } - - private: - std::vector args_; -}; - -// A struct that encompasses the arguments to the child process of a -// threadsafe-style death test process. -struct ExecDeathTestArgs { - char* const* argv; // Command-line arguments for the child's call to exec - int close_fd; // File descriptor to close; the read end of a pipe -}; - -# if GTEST_OS_MAC -inline char** GetEnviron() { - // When Google Test is built as a framework on MacOS X, the environ variable - // is unavailable. Apple's documentation (man environ) recommends using - // _NSGetEnviron() instead. - return *_NSGetEnviron(); -} -# else -// Some POSIX platforms expect you to declare environ. extern "C" makes -// it reside in the global namespace. -extern "C" char** environ; -inline char** GetEnviron() { return environ; } -# endif // GTEST_OS_MAC - -# if !GTEST_OS_QNX -// The main function for a threadsafe-style death test child process. -// This function is called in a clone()-ed process and thus must avoid -// any potentially unsafe operations like malloc or libc functions. -static int ExecDeathTestChildMain(void* child_arg) { - ExecDeathTestArgs* const args = static_cast(child_arg); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); - - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; - } - - // We can safely call execve() as it's a direct system call. We - // cannot use execvp() as it's a libc function and thus potentially - // unsafe. Since execve() doesn't search the PATH, the user must - // invoke the test program via a valid path that contains at least - // one path separator. - execve(args->argv[0], args->argv, GetEnviron()); - DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " + - original_dir + " failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; -} -# endif // !GTEST_OS_QNX - -// Two utility routines that together determine the direction the stack -// grows. -// This could be accomplished more elegantly by a single recursive -// function, but we want to guard against the unlikely possibility of -// a smart compiler optimizing the recursion away. -// -// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining -// StackLowerThanAddress into StackGrowsDown, which then doesn't give -// correct answer. 
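A hypothetical use of the Arguments helper above, showing the NULL-terminated argv that execve() in ExecDeathTestChildMain() expects; the path and flag values are made-up placeholders:

    Arguments args;
    args.AddArgument("/path/to/my_test");
    args.AddArgument("--gtest_filter=MyDeathTest.ExitStatusIsChecked");
    args.AddArgument("--gtest_internal_run_death_test=foo_test.cc|42|0|5");
    char* const* child_argv = args.Argv();  // child_argv[3] is the trailing NULL.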
-void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; -void StackLowerThanAddress(const void* ptr, bool* result) { - int dummy; - *result = (&dummy < ptr); -} - -bool StackGrowsDown() { - int dummy; - bool result; - StackLowerThanAddress(&dummy, &result); - return result; -} - -// Spawns a child process with the same executable as the current process in -// a thread-safe manner and instructs it to run the death test. The -// implementation uses fork(2) + exec. On systems where clone(2) is -// available, it is used instead, being slightly more thread-safe. On QNX, -// fork supports only single-threaded environments, so this function uses -// spawn(2) there instead. The function dies with an error message if -// anything goes wrong. -static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; - pid_t child_pid = -1; - -# if GTEST_OS_QNX - // Obtains the current directory and sets it to be closed in the child - // process. - const int cwd_fd = open(".", O_RDONLY); - GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; - } - - int fd_flags; - // Set close_fd to be closed after spawn. - GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, - fd_flags | FD_CLOEXEC)); - struct inheritance inherit = {0}; - // spawn is a system call. - child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron()); - // Restores the current working directory. - GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); - -# else // GTEST_OS_QNX -# if GTEST_OS_LINUX - // When a SIGPROF signal is received while fork() or clone() are executing, - // the process may hang. To avoid this, we ignore SIGPROF here and re-enable - // it after the call to fork()/clone() is complete. - struct sigaction saved_sigprof_action; - struct sigaction ignore_sigprof_action; - memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); - sigemptyset(&ignore_sigprof_action.sa_mask); - ignore_sigprof_action.sa_handler = SIG_IGN; - GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( - SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); -# endif // GTEST_OS_LINUX - -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); - - if (!use_fork) { - static const bool stack_grows_down = StackGrowsDown(); - const size_t stack_size = getpagesize(); - // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. - void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0); - GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); - - // Maximum stack alignment in bytes: For a downward-growing stack, this - // amount is subtracted from size of the stack space to get an address - // that is within the stack space and is aligned on all systems we care - // about. As far as I know there is no ABI with stack alignment greater - // than 64. 
We assume stack and stack_size already have alignment of - // kMaxStackAlignment. - const size_t kMaxStackAlignment = 64; - void* const stack_top = - static_cast(stack) + - (stack_grows_down ? stack_size - kMaxStackAlignment : 0); - GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment && - reinterpret_cast(stack_top) % kMaxStackAlignment == 0); - - child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); - - GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); - } -# else - const bool use_fork = true; -# endif // GTEST_HAS_CLONE - - if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); - } -# endif // GTEST_OS_QNX -# if GTEST_OS_LINUX - GTEST_DEATH_TEST_CHECK_SYSCALL_( - sigaction(SIGPROF, &saved_sigprof_action, NULL)); -# endif // GTEST_OS_LINUX - - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - return child_pid; -} - -// The AssumeRole process for a fork-and-exec death test. It re-executes the -// main program from the beginning, setting the --gtest_filter -// and --gtest_internal_run_death_test flags to cause only the current -// death test to be re-run. -DeathTest::TestRole ExecDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - // Clear the close-on-exec flag on the write end of the pipe, lest - // it be closed when the child process does an exec: - GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); - - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" - + info->test_case_name() + "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" + StreamableToString(line_) + "|" - + StreamableToString(death_test_index) + "|" - + StreamableToString(pipe_fd[1]); - Arguments args; - args.AddArguments(GetArgvsForDeathTestChildProcess()); - args.AddArgument(filter_flag.c_str()); - args.AddArgument(internal_flag.c_str()); - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // See the comment in NoExecDeathTest::AssumeRole for why the next line - // is necessary. - FlushInfoLog(); - - const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_child_pid(child_pid); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; -} - -# endif // !GTEST_OS_WINDOWS - -// Creates a concrete DeathTest-derived class that depends on the -// --gtest_death_test_style flag, and sets the pointer pointed to -// by the "test" argument to its address. If the test should be -// skipped, sets that pointer to NULL. Returns true, unless the -// flag is set to an invalid value. 
-bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, - const char* file, int line, - DeathTest** test) { - UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); - - if (flag != NULL) { - if (death_test_index > flag->index()) { - DeathTest::set_last_death_test_message( - "Death test count (" + StreamableToString(death_test_index) - + ") somehow exceeded expected maximum (" - + StreamableToString(flag->index()) + ")"); - return false; - } - - if (!(flag->file() == file && flag->line() == line && - flag->index() == death_test_index)) { - *test = NULL; - return true; - } - } - -# if GTEST_OS_WINDOWS - - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { - *test = new WindowsDeathTest(statement, regex, file, line); - } - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") { - *test = new ExecDeathTest(statement, regex, file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { - *test = new NoExecDeathTest(statement, regex); - } - -# endif // GTEST_OS_WINDOWS - - else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message( - "Unknown death test style \"" + GTEST_FLAG(death_test_style) - + "\" encountered"); - return false; - } - - return true; -} - -// Splits a given string on a given delimiter, populating a given -// vector with the fields. GTEST_HAS_DEATH_TEST implies that we have -// ::std::string, so we can use it here. -static void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest) { - ::std::vector< ::std::string> parsed; - ::std::string::size_type pos = 0; - while (::testing::internal::AlwaysTrue()) { - const ::std::string::size_type colon = str.find(delimiter, pos); - if (colon == ::std::string::npos) { - parsed.push_back(str.substr(pos)); - break; - } else { - parsed.push_back(str.substr(pos, colon - pos)); - pos = colon + 1; - } - } - dest->swap(parsed); -} - -# if GTEST_OS_WINDOWS -// Recreates the pipe and event handles from the provided parameters, -// signals the event, and returns a file descriptor wrapped around the pipe -// handle. This function is called in the child process only. -int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { - AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); - if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { - DeathTestAbort("Unable to open parent process " + - StreamableToString(parent_process_id)); - } - - // TODO(vladl@google.com): Replace the following check with a - // compile-time assertion when available. - GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - - const HANDLE write_handle = - reinterpret_cast(write_handle_as_size_t); - HANDLE dup_write_handle; - - // The newly initialized handle is accessible only in in the parent - // process. To obtain one accessible within the child, we need to use - // DuplicateHandle. - if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, - ::GetCurrentProcess(), &dup_write_handle, - 0x0, // Requested privileges ignored since - // DUPLICATE_SAME_ACCESS is used. - FALSE, // Request non-inheritable handler. 
- DUPLICATE_SAME_ACCESS)) { - DeathTestAbort("Unable to duplicate the pipe handle " + - StreamableToString(write_handle_as_size_t) + - " from the parent process " + - StreamableToString(parent_process_id)); - } - - const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); - HANDLE dup_event_handle; - - if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, - DUPLICATE_SAME_ACCESS)) { - DeathTestAbort("Unable to duplicate the event handle " + - StreamableToString(event_handle_as_size_t) + - " from the parent process " + - StreamableToString(parent_process_id)); - } - - const int write_fd = - ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); - if (write_fd == -1) { - DeathTestAbort("Unable to convert pipe handle " + - StreamableToString(write_handle_as_size_t) + - " to a file descriptor"); - } - - // Signals the parent that the write end of the pipe has been acquired - // so the parent can release its own write end. - ::SetEvent(dup_event_handle); - - return write_fd; -} -# endif // GTEST_OS_WINDOWS - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return NULL; - - // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we - // can use it here. - int line = -1; - int index = -1; - ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); - int write_fd = -1; - -# if GTEST_OS_WINDOWS - - unsigned int parent_process_id = 0; - size_t write_handle_as_size_t = 0; - size_t event_handle_as_size_t = 0; - - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + - GTEST_FLAG(internal_run_death_test)); - } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, - event_handle_as_size_t); -# else - - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); - } - -# endif // GTEST_OS_WINDOWS - - return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); -} - -} // namespace internal - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: keith.ray@gmail.com (Keith Ray) - - -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include -#elif GTEST_OS_WINDOWS -# include -# include -#elif GTEST_OS_SYMBIAN -// Symbian OpenC has PATH_MAX in sys/syslimits.h -# include -#else -# include -# include // Some Linux distributions define PATH_MAX here. -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH -#elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX -#elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX -#else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX -#endif // GTEST_OS_WINDOWS - - -namespace testing { -namespace internal { - -#if GTEST_OS_WINDOWS -// On Windows, '\\' is the standard path separator, but many tools and the -// Windows API also accept '/' as an alternate path separator. Unless otherwise -// noted, a file path can contain either kind of path separators, or a mixture -// of them. -const char kPathSeparator = '\\'; -const char kAlternatePathSeparator = '/'; -const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE -// Windows CE doesn't have a current directory. You should not use -// the current directory in tests on Windows CE, but this at least -// provides a reasonable fallback. -const char kCurrentDirectoryString[] = "\\"; -// Windows CE doesn't define INVALID_FILE_ATTRIBUTES -const DWORD kInvalidFileAttributes = 0xffffffff; -# else -const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE -#else -const char kPathSeparator = '/'; -const char kCurrentDirectoryString[] = "./"; -#endif // GTEST_OS_WINDOWS - -// Returns whether the given character is a valid path separator. -static bool IsPathSeparator(char c) { -#if GTEST_HAS_ALT_PATH_SEP_ - return (c == kPathSeparator) || (c == kAlternatePathSeparator); -#else - return c == kPathSeparator; -#endif -} - -// Returns the current working directory, or "" if unsuccessful. -FilePath FilePath::GetCurrentDir() { -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE doesn't have a current directory, so we just return - // something reasonable. - return FilePath(kCurrentDirectoryString); -#elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns a copy of the FilePath with the case-insensitive extension removed. 
-// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns -// FilePath("dir/file"). If a case-insensitive extension is not -// found, returns a copy of the original FilePath. -FilePath FilePath::RemoveExtension(const char* extension) const { - const std::string dot_extension = std::string(".") + extension; - if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { - return FilePath(pathname_.substr( - 0, pathname_.length() - dot_extension.length())); - } - return *this; -} - -// Returns a pointer to the last occurence of a valid path separator in -// the FilePath. On Windows, for example, both '/' and '\' are valid path -// separators. Returns NULL if no path separator was found. -const char* FilePath::FindLastPathSeparator() const { - const char* const last_sep = strrchr(c_str(), kPathSeparator); -#if GTEST_HAS_ALT_PATH_SEP_ - const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); - // Comparing two pointers of which only one is NULL is undefined. - if (last_alt_sep != NULL && - (last_sep == NULL || last_alt_sep > last_sep)) { - return last_alt_sep; - } -#endif - return last_sep; -} - -// Returns a copy of the FilePath with the directory part removed. -// Example: FilePath("path/to/file").RemoveDirectoryName() returns -// FilePath("file"). If there is no directory part ("just_a_file"), it returns -// the FilePath unmodified. If there is no file part ("just_a_dir/") it -// returns an empty FilePath (""). -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveDirectoryName() const { - const char* const last_sep = FindLastPathSeparator(); - return last_sep ? FilePath(last_sep + 1) : *this; -} - -// RemoveFileName returns the directory path with the filename removed. -// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". -// If the FilePath is "a_file" or "/a_file", RemoveFileName returns -// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does -// not have a file, like "just/a/dir/", it returns the FilePath unmodified. -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveFileName() const { - const char* const last_sep = FindLastPathSeparator(); - std::string dir; - if (last_sep) { - dir = std::string(c_str(), last_sep + 1 - c_str()); - } else { - dir = kCurrentDirectoryString; - } - return FilePath(dir); -} - -// Helper functions for naming files in a directory for xml output. - -// Given directory = "dir", base_name = "test", number = 0, -// extension = "xml", returns "dir/test.xml". If number is greater -// than zero (e.g., 12), returns "dir/test_12.xml". -// On Windows platform, uses \ as the separator rather than /. -FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension) { - std::string file; - if (number == 0) { - file = base_name.string() + "." + extension; - } else { - file = base_name.string() + "_" + StreamableToString(number) - + "." + extension; - } - return ConcatPaths(directory, FilePath(file)); -} - -// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". -// On Windows, uses \ as the separator rather than /. 
-FilePath FilePath::ConcatPaths(const FilePath& directory, - const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; - const FilePath dir(directory.RemoveTrailingPathSeparator()); - return FilePath(dir.string() + kPathSeparator + relative_path.string()); -} - -// Returns true if pathname describes something findable in the file-system, -// either a file, directory, or whatever. -bool FilePath::FileOrDirectoryExists() const { -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - return attributes != kInvalidFileAttributes; -#else - posix::StatStruct file_stat; - return posix::Stat(pathname_.c_str(), &file_stat) == 0; -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns true if pathname describes a directory in the file-system -// that exists. -bool FilePath::DirectoryExists() const { - bool result = false; -#if GTEST_OS_WINDOWS - // Don't strip off trailing separator if path is a root directory on - // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? *this : - RemoveTrailingPathSeparator()); -#else - const FilePath& path(*this); -#endif - -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - if ((attributes != kInvalidFileAttributes) && - (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - result = true; - } -#else - posix::StatStruct file_stat; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); -#endif // GTEST_OS_WINDOWS_MOBILE - - return result; -} - -// Returns true if pathname describes a root directory. (Windows has one -// root directory per disk drive.) -bool FilePath::IsRootDirectory() const { -#if GTEST_OS_WINDOWS - // TODO(wan@google.com): on Windows a network share like - // \\server\share can be a root directory, although it cannot be the - // current directory. Handle this properly. - return pathname_.length() == 3 && IsAbsolutePath(); -#else - return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); -#endif -} - -// Returns true if pathname describes an absolute path. -bool FilePath::IsAbsolutePath() const { - const char* const name = pathname_.c_str(); -#if GTEST_OS_WINDOWS - return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); -#else - return IsPathSeparator(name[0]); -#endif -} - -// Returns a pathname for a file that does not currently exist. The pathname -// will be directory/base_name.extension or -// directory/base_name_.extension if directory/base_name.extension -// already exists. The number will be incremented until a pathname is found -// that does not already exist. -// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. -// There could be a race condition if two or more processes are calling this -// function at the same time -- they could both pick the same filename. -FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension) { - FilePath full_pathname; - int number = 0; - do { - full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); - } while (full_pathname.FileOrDirectoryExists()); - return full_pathname; -} - -// Returns true if FilePath ends with a path separator, which indicates that -// it is intended to represent a directory. Returns false otherwise. 
-// This does NOT check that a directory (or file) actually exists. -bool FilePath::IsDirectory() const { - return !pathname_.empty() && - IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); -} - -// Create directories so that path exists. Returns true if successful or if -// the directories already exist; returns false if unable to create directories -// for any reason. -bool FilePath::CreateDirectoriesRecursively() const { - if (!this->IsDirectory()) { - return false; - } - - if (pathname_.length() == 0 || this->DirectoryExists()) { - return true; - } - - const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); - return parent.CreateDirectoriesRecursively() && this->CreateFolder(); -} - -// Create the directory so that path exists. Returns true if successful or -// if the directory already exists; returns false if unable to create the -// directory for any reason, including if the parent directory does not -// exist. Not named "CreateDirectory" because that's a macro on Windows. -bool FilePath::CreateFolder() const { -#if GTEST_OS_WINDOWS_MOBILE - FilePath removed_sep(this->RemoveTrailingPathSeparator()); - LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); - int result = CreateDirectory(unicode, NULL) ? 0 : -1; - delete [] unicode; -#elif GTEST_OS_WINDOWS - int result = _mkdir(pathname_.c_str()); -#else - int result = mkdir(pathname_.c_str(), 0777); -#endif // GTEST_OS_WINDOWS_MOBILE - - if (result == -1) { - return this->DirectoryExists(); // An error is OK if the directory exists. - } - return true; // No error. -} - -// If input name has a trailing separator character, remove it and return the -// name, otherwise return the name string unmodified. -// On Windows platform, uses \ as the separator, other platforms use /. -FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(pathname_.substr(0, pathname_.length() - 1)) - : *this; -} - -// Removes any redundant separators that might be in the pathname. -// For example, "bar///foo" becomes "bar/foo". Does not eliminate other -// redundancies that might be in a pathname involving "." or "..". -// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). -void FilePath::Normalize() { - if (pathname_.c_str() == NULL) { - pathname_ = ""; - return; - } - const char* src = pathname_.c_str(); - char* const dest = new char[pathname_.length() + 1]; - char* dest_ptr = dest; - memset(dest_ptr, 0, pathname_.length() + 1); - - while (*src != '\0') { - *dest_ptr = *src; - if (!IsPathSeparator(*src)) { - src++; - } else { -#if GTEST_HAS_ALT_PATH_SEP_ - if (*dest_ptr == kAlternatePathSeparator) { - *dest_ptr = kPathSeparator; - } -#endif - while (IsPathSeparator(*src)) - src++; - } - dest_ptr++; - } - *dest_ptr = '\0'; - pathname_ = dest; - delete[] dest; -} - -} // namespace internal -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -#include -#include -#include -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include // For TerminateProcess() -#elif GTEST_OS_WINDOWS -# include -# include -#else -# include -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_MAC -# include -# include -# include -#endif // GTEST_OS_MAC - -#if GTEST_OS_QNX -# include -# include -#endif // GTEST_OS_QNX - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { -namespace internal { - -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. -const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_MAC - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const task_t task = mach_task_self(); - mach_msg_type_number_t thread_count; - thread_act_array_t thread_list; - const kern_return_t status = task_threads(task, &thread_list, &thread_count); - if (status == KERN_SUCCESS) { - // task_threads allocates resources in thread_list and we need to free them - // to avoid leaks. - vm_deallocate(task, - reinterpret_cast(thread_list), - sizeof(thread_t) * thread_count); - return static_cast(thread_count); - } else { - return 0; - } -} - -#elif GTEST_OS_QNX - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const int fd = open("/proc/self/as", O_RDONLY); - if (fd < 0) { - return 0; - } - procfs_info process_info; - const int status = - devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); - close(fd); - if (status == EOK) { - return static_cast(process_info.num_threads); - } else { - return 0; - } -} - -#else - -size_t GetThreadCount() { - // There's no portable way to detect the number of threads, so we just - // return 0 to indicate that we cannot detect it. - return 0; -} - -#endif // GTEST_OS_MAC - -#if GTEST_USES_POSIX_RE - -// Implements RE. Currently only needed for death tests. 
- -RE::~RE() { - if (is_valid_) { - // regfree'ing an invalid regex might crash because the content - // of the regex is undefined. Since the regex's are essentially - // the same, one cannot be valid (or invalid) without the other - // being so too. - regfree(&partial_regex_); - regfree(&full_regex_); - } - free(const_cast(pattern_)); -} - -// Returns true iff regular expression re matches the entire str. -bool RE::FullMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.full_regex_, str, 1, &match, 0) == 0; -} - -// Returns true iff regular expression re matches a substring of str -// (including str itself). -bool RE::PartialMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; -} - -// Initializes an RE from its string representation. -void RE::Init(const char* regex) { - pattern_ = posix::StrDup(regex); - - // Reserves enough bytes to hold the regular expression used for a - // full match. - const size_t full_regex_len = strlen(regex) + 10; - char* const full_pattern = new char[full_regex_len]; - - snprintf(full_pattern, full_regex_len, "^(%s)$", regex); - is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; - // We want to call regcomp(&partial_regex_, ...) even if the - // previous expression returns false. Otherwise partial_regex_ may - // not be properly initialized can may cause trouble when it's - // freed. - // - // Some implementation of POSIX regex (e.g. on at least some - // versions of Cygwin) doesn't accept the empty string as a valid - // regex. We change it to an equivalent form "()" to be safe. - if (is_valid_) { - const char* const partial_regex = (*regex == '\0') ? "()" : regex; - is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; - } - EXPECT_TRUE(is_valid_) - << "Regular expression \"" << regex - << "\" is not a valid POSIX Extended regular expression."; - - delete[] full_pattern; -} - -#elif GTEST_USES_SIMPLE_RE - -// Returns true iff ch appears anywhere in str (excluding the -// terminating '\0' character). -bool IsInSet(char ch, const char* str) { - return ch != '\0' && strchr(str, ch) != NULL; -} - -// Returns true iff ch belongs to the given classification. Unlike -// similar functions in , these aren't affected by the -// current locale. -bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } -bool IsAsciiPunct(char ch) { - return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); -} -bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } -bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } -bool IsAsciiWordChar(char ch) { - return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; -} - -// Returns true iff "\\c" is a supported escape sequence. -bool IsValidEscape(char c) { - return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); -} - -// Returns true iff the given atom (specified by escaped and pattern) -// matches ch. The result is undefined if the atom is invalid. -bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { - if (escaped) { // "\\p" where p is pattern_char. 
- switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); - } - return IsAsciiPunct(pattern_char) && pattern_char == ch; - } - - return (pattern_char == '.' && ch != '\n') || pattern_char == ch; -} - -// Helper function used by ValidateRegex() to format error messages. -std::string FormatRegexSyntaxError(const char* regex, int index) { - return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); -} - -// Generates non-fatal failures and returns false if regex is invalid; -// otherwise returns true. -bool ValidateRegex(const char* regex) { - if (regex == NULL) { - // TODO(wan@google.com): fix the source file location in the - // assertion failures to match where the regex is used in user - // code. - ADD_FAILURE() << "NULL is not a valid simple regular expression."; - return false; - } - - bool is_valid = true; - - // True iff ?, *, or + can follow the previous atom. - bool prev_repeatable = false; - for (int i = 0; regex[i]; i++) { - if (regex[i] == '\\') { // An escape sequence - i++; - if (regex[i] == '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "'\\' cannot appear at the end."; - return false; - } - - if (!IsValidEscape(regex[i])) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "invalid escape sequence \"\\" << regex[i] << "\"."; - is_valid = false; - } - prev_repeatable = true; - } else { // Not an escape sequence. - const char ch = regex[i]; - - if (ch == '^' && i > 0) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'^' can only appear at the beginning."; - is_valid = false; - } else if (ch == '$' && regex[i + 1] != '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'$' can only appear at the end."; - is_valid = false; - } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; - is_valid = false; - } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; - is_valid = false; - } - - prev_repeatable = !IsInSet(ch, "^$?*+"); - } - } - - return is_valid; -} - -// Matches a repeated regex atom followed by a valid simple regular -// expression. The regex atom is defined as c if escaped is false, -// or \c otherwise. repeat is the repetition meta character (?, *, -// or +). The behavior is undefined if str contains too many -// characters to be indexable by size_t, in which case the test will -// probably time out anyway. We are fine with this limitation as -// std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { - const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast(-1) - 1; - // We cannot call numeric_limits::max() as it conflicts with the - // max() macro on Windows. - - for (size_t i = 0; i <= max_count; ++i) { - // We know that the atom matches each of the first i characters in str. 
-    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
-      // We have enough matches at the head, and the tail matches too.
-      // Since we only care about *whether* the pattern matches str
-      // (as opposed to *how* it matches), there is no need to find a
-      // greedy match.
-      return true;
-    }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
-  }
-  return false;
-}
-
-// Returns true iff regex matches a prefix of str. regex must be a
-// valid simple regular expression and not start with "^", or the
-// result is undefined.
-bool MatchRegexAtHead(const char* regex, const char* str) {
-  if (*regex == '\0')  // An empty regex matches a prefix of anything.
-    return true;
-
-  // "$" only matches the end of a string. Note that regex being
-  // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
-
-  // Is the first thing in regex an escape sequence?
-  const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
-  if (IsRepeat(regex[1])) {
-    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
-    // here's an indirect recursion. It terminates as the regex gets
-    // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
-  } else {
-    // regex isn't empty, isn't "$", and doesn't start with a
-    // repetition. We match the first atom of regex with the first
-    // character of str and recurse.
-    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
-  }
-}
-
-// Returns true iff regex matches any substring of str. regex must be
-// a valid simple regular expression, or the result is undefined.
-//
-// The algorithm is recursive, but the recursion depth doesn't exceed
-// the regex length, so we won't need to worry about running out of
-// stack space normally. In rare cases the time complexity can be
-// exponential with respect to the regex length + the string length,
-// but usually it's much faster (often close to linear).
-bool MatchRegexAnywhere(const char* regex, const char* str) {
-  if (regex == NULL || str == NULL)
-    return false;
-
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
-
-  // A successful match can be anywhere in str.
-  do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
-  } while (*str++ != '\0');
-  return false;
-}
-
-// Implements the RE class.
-
-RE::~RE() {
-  free(const_cast<char*>(pattern_));
-  free(const_cast<char*>(full_pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = full_pattern_ = NULL;
-  if (regex != NULL) {
-    pattern_ = posix::StrDup(regex);
-  }
-
-  is_valid_ = ValidateRegex(regex);
-  if (!is_valid_) {
-    // No need to calculate the full pattern when the regex is invalid.
-    return;
-  }
-
-  const size_t len = strlen(regex);
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match: we need space to prepend a '^', append a '$', and
-  // terminate the string with '\0'.
- char* buffer = static_cast(malloc(len + 3)); - full_pattern_ = buffer; - - if (*regex != '^') - *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. - - // We don't use snprintf or strncpy, as they trigger a warning when - // compiled with VC++ 8.0. - memcpy(buffer, regex, len); - buffer += len; - - if (len == 0 || regex[len - 1] != '$') - *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. - - *buffer = '\0'; -} - -#endif // GTEST_USES_POSIX_RE - -const char kUnknownFile[] = "unknown file"; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); - - if (line < 0) { - return file_name + ":"; - } -#ifdef _MSC_VER - return file_name + "(" + StreamableToString(line) + "):"; -#else - return file_name + ":" + StreamableToString(line) + ":"; -#endif // _MSC_VER -} - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -// Note that FormatCompilerIndependentFileLocation() does NOT append colon -// to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); - - if (line < 0) - return file_name; - else - return file_name + ":" + StreamableToString(line); -} - - -GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) - : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; -} - -// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. -GTestLog::~GTestLog() { - GetStream() << ::std::endl; - if (severity_ == GTEST_FATAL) { - fflush(stderr); - posix::Abort(); - } -} -// Disable Microsoft deprecation warnings for POSIX functions called from -// this class (creat, dup, dup2, and close) -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable: 4996) -#endif // _MSC_VER - -#if GTEST_HAS_STREAM_REDIRECTION - -// Object that captures an output stream (stdout/stderr). -class CapturedStream { - public: - // The ctor redirects the stream to a temporary file. - explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT - - ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", - 0, // Generate unique file name. - temp_file_path); - GTEST_CHECK_(success != 0) - << "Unable to create a temporary file in " << temp_dir_path; - const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; - filename_ = temp_file_path; -# else - // There's no guarantee that a test has write access to the current - // directory, so we create the temporary file in the /tmp directory - // instead. We use /tmp on most systems, and /sdcard on Android. - // That's because Android doesn't have /tmp. 
-# if GTEST_OS_LINUX_ANDROID - // Note: Android applications are expected to call the framework's - // Context.getExternalStorageDirectory() method through JNI to get - // the location of the world-writable SD Card directory. However, - // this requires a Context handle, which cannot be retrieved - // globally from native code. Doing so also precludes running the - // code as part of a regular standalone executable, which doesn't - // run in a Dalvik process (e.g. when running it through 'adb shell'). - // - // The location /sdcard is directly accessible from native code - // and is the only location (unofficially) supported by the Android - // team. It's generally a symlink to the real SD Card mount point - // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or - // other OEM-customized locations. Never rely on these, and always - // use /sdcard. - char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; -# else - char name_template[] = "/tmp/captured_stream.XXXXXX"; -# endif // GTEST_OS_LINUX_ANDROID - const int captured_fd = mkstemp(name_template); - filename_ = name_template; -# endif // GTEST_OS_WINDOWS - fflush(NULL); - dup2(captured_fd, fd_); - close(captured_fd); - } - - ~CapturedStream() { - remove(filename_.c_str()); - } - - std::string GetCapturedString() { - if (uncaptured_fd_ != -1) { - // Restores the original stream. - fflush(NULL); - dup2(uncaptured_fd_, fd_); - close(uncaptured_fd_); - uncaptured_fd_ = -1; - } - - FILE* const file = posix::FOpen(filename_.c_str(), "r"); - const std::string content = ReadEntireFile(file); - posix::FClose(file); - return content; - } - - private: - // Reads the entire content of a file as an std::string. - static std::string ReadEntireFile(FILE* file); - - // Returns the size (in bytes) of a file. - static size_t GetFileSize(FILE* file); - - const int fd_; // A stream to capture. - int uncaptured_fd_; - // Name of the temporary file holding the stderr output. - ::std::string filename_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); -}; - -// Returns the size (in bytes) of a file. -size_t CapturedStream::GetFileSize(FILE* file) { - fseek(file, 0, SEEK_END); - return static_cast(ftell(file)); -} - -// Reads the entire content of a file as a string. -std::string CapturedStream::ReadEntireFile(FILE* file) { - const size_t file_size = GetFileSize(file); - char* const buffer = new char[file_size]; - - size_t bytes_last_read = 0; // # of bytes read in the last fread() - size_t bytes_read = 0; // # of bytes read so far - - fseek(file, 0, SEEK_SET); - - // Keeps reading the file until we cannot read further or the - // pre-determined file size is reached. - do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); - bytes_read += bytes_last_read; - } while (bytes_last_read > 0 && bytes_read < file_size); - - const std::string content(buffer, bytes_read); - delete[] buffer; - - return content; -} - -# ifdef _MSC_VER -# pragma warning(pop) -# endif // _MSC_VER - -static CapturedStream* g_captured_stderr = NULL; -static CapturedStream* g_captured_stdout = NULL; - -// Starts capturing an output stream (stdout/stderr). -void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { - if (*stream != NULL) { - GTEST_LOG_(FATAL) << "Only one " << stream_name - << " capturer can exist at a time."; - } - *stream = new CapturedStream(fd); -} - -// Stops capturing the output stream and returns the captured string. 
-std::string GetCapturedStream(CapturedStream** captured_stream) { - const std::string content = (*captured_stream)->GetCapturedString(); - - delete *captured_stream; - *captured_stream = NULL; - - return content; -} - -// Starts capturing stdout. -void CaptureStdout() { - CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); -} - -// Starts capturing stderr. -void CaptureStderr() { - CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); -} - -// Stops capturing stdout and returns the captured string. -std::string GetCapturedStdout() { - return GetCapturedStream(&g_captured_stdout); -} - -// Stops capturing stderr and returns the captured string. -std::string GetCapturedStderr() { - return GetCapturedStream(&g_captured_stderr); -} - -#endif // GTEST_HAS_STREAM_REDIRECTION - -#if GTEST_HAS_DEATH_TEST - -// A copy of all command line arguments. Set by InitGoogleTest(). -::std::vector g_argvs; - -static const ::std::vector* g_injected_test_argvs = - NULL; // Owned. - -void SetInjectableArgvs(const ::std::vector* argvs) { - if (g_injected_test_argvs != argvs) - delete g_injected_test_argvs; - g_injected_test_argvs = argvs; -} - -const ::std::vector& GetInjectableArgvs() { - if (g_injected_test_argvs != NULL) { - return *g_injected_test_argvs; - } - return g_argvs; -} -#endif // GTEST_HAS_DEATH_TEST - -#if GTEST_OS_WINDOWS_MOBILE -namespace posix { -void Abort() { - DebugBreak(); - TerminateProcess(GetCurrentProcess(), 1); -} -} // namespace posix -#endif // GTEST_OS_WINDOWS_MOBILE - -// Returns the name of the environment variable corresponding to the -// given flag. For example, FlagToEnvVar("foo") will return -// "GTEST_FOO" in the open-source version. -static std::string FlagToEnvVar(const char* flag) { - const std::string full_flag = - (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); - - Message env_var; - for (size_t i = 0; i != full_flag.length(); i++) { - env_var << ToUpper(full_flag.c_str()[i]); - } - - return env_var.GetString(); -} - -// Parses 'str' for a 32-bit signed integer. If successful, writes -// the result to *value and returns true; otherwise leaves *value -// unchanged and returns false. -bool ParseInt32(const Message& src_text, const char* str, Int32* value) { - // Parses the environment variable as a decimal integer. - char* end = NULL; - const long long_value = strtol(str, &end, 10); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value \"" << str << "\".\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - // Is the parsed value in the range of an Int32? - const Int32 result = static_cast(long_value); - if (long_value == LONG_MAX || long_value == LONG_MIN || - // The parsed value overflows as a long. (strtol() returns - // LONG_MAX or LONG_MIN when the input overflows.) - result != long_value - // The parsed value overflows as an Int32. - ) { - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value " << str << ", which overflows.\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - *value = result; - return true; -} - -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". 
-bool BoolFromGTestEnv(const char* flag, bool default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - return string_value == NULL ? - default_value : strcmp(string_value, "0") != 0; -} - -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - if (string_value == NULL) { - // The environment variable is not set. - return default_value; - } - - Int32 result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { - printf("The default value %s is used.\n", - (Message() << default_value).GetString().c_str()); - fflush(stdout); - return default_value; - } - - return result; -} - -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value. -const char* StringFromGTestEnv(const char* flag, const char* default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const value = posix::GetEnv(env_var.c_str()); - return value == NULL ? default_value : value; -} - -} // namespace internal -} // namespace testing -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// It uses the << operator when possible, and prints the bytes in the -// object otherwise. 
A user can override its behavior for a class -// type Foo by defining either operator<<(::std::ostream&, const Foo&) -// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that -// defines Foo. - -#include -#include -#include // NOLINT -#include - -namespace testing { - -namespace { - -using ::std::ostream; - -// Prints a segment of bytes in the given object. -void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, - size_t count, ostream* os) { - char text[5] = ""; - for (size_t i = 0; i != count; i++) { - const size_t j = start + i; - if (i != 0) { - // Organizes the bytes into groups of 2 for easy parsing by - // human. - if ((j % 2) == 0) - *os << ' '; - else - *os << '-'; - } - GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); - *os << text; - } -} - -// Prints the bytes in the given value to the given ostream. -void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, - ostream* os) { - // Tells the user how big the object is. - *os << count << "-byte object <"; - - const size_t kThreshold = 132; - const size_t kChunkSize = 64; - // If the object size is bigger than kThreshold, we'll have to omit - // some details by printing only the first and the last kChunkSize - // bytes. - // TODO(wan): let the user control the threshold using a flag. - if (count < kThreshold) { - PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); - } else { - PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); - *os << " ... "; - // Rounds up to 2-byte boundary. - const size_t resume_pos = (count - kChunkSize + 1)/2*2; - PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); - } - *os << ">"; -} - -} // namespace - -namespace internal2 { - -// Delegates to PrintBytesInObjectToImpl() to print the bytes in the -// given object. The delegation simplifies the implementation, which -// uses the << operator and thus is easier done outside of the -// ::testing::internal namespace, which contains a << operator that -// sometimes conflicts with the one in STL. -void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, - ostream* os) { - PrintBytesInObjectToImpl(obj_bytes, count, os); -} - -} // namespace internal2 - -namespace internal { - -// Depending on the value of a char (or wchar_t), we print it in one -// of three formats: -// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or -// - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; - -// Returns true if c is a printable ASCII character. We test the -// value of c directly instead of calling isprint(), which is buggy on -// Windows Mobile. -inline bool IsPrintableAscii(wchar_t c) { - return 0x20 <= c && c <= 0x7E; -} - -// Prints a wide or narrow char c as a character literal without the -// quotes, escaping it when necessary; returns how c was formatted. -// The template argument UnsignedChar is the unsigned version of Char, -// which is the type of c. 
-template -static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { - switch (static_cast(c)) { - case L'\0': - *os << "\\0"; - break; - case L'\'': - *os << "\\'"; - break; - case L'\\': - *os << "\\\\"; - break; - case L'\a': - *os << "\\a"; - break; - case L'\b': - *os << "\\b"; - break; - case L'\f': - *os << "\\f"; - break; - case L'\n': - *os << "\\n"; - break; - case L'\r': - *os << "\\r"; - break; - case L'\t': - *os << "\\t"; - break; - case L'\v': - *os << "\\v"; - break; - default: - if (IsPrintableAscii(c)) { - *os << static_cast(c); - return kAsIs; - } else { - *os << "\\x" + String::FormatHexInt(static_cast(c)); - return kHexEscape; - } - } - return kSpecialEscape; -} - -// Prints a wchar_t c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { - switch (c) { - case L'\'': - *os << "'"; - return kAsIs; - case L'"': - *os << "\\\""; - return kSpecialEscape; - default: - return PrintAsCharLiteralTo(c, os); - } -} - -// Prints a char c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { - return PrintAsStringLiteralTo( - static_cast(static_cast(c)), os); -} - -// Prints a wide or narrow character c and its code. '\0' is printed -// as "'\\0'", other unprintable characters are also properly escaped -// using the standard C++ escape sequence. The template argument -// UnsignedChar is the unsigned version of Char, which is the type of c. -template -void PrintCharAndCodeTo(Char c, ostream* os) { - // First, print c as a literal in the most readable form we can find. - *os << ((sizeof(c) > 1) ? "L'" : "'"); - const CharFormat format = PrintAsCharLiteralTo(c, os); - *os << "'"; - - // To aid user debugging, we also print c's code in decimal, unless - // it's 0 (in which case c was printed as '\\0', making the code - // obvious). - if (c == 0) - return; - *os << " (" << static_cast(c); - - // For more convenience, we print c's code again in hexidecimal, - // unless c was already printed in the form '\x##' or the code is in - // [1, 9]. - if (format == kHexEscape || (1 <= c && c <= 9)) { - // Do nothing. - } else { - *os << ", 0x" << String::FormatHexInt(static_cast(c)); - } - *os << ")"; -} - -void PrintTo(unsigned char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} -void PrintTo(signed char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} - -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its code. L'\0' is printed as "L'\\0'". -void PrintTo(wchar_t wc, ostream* os) { - PrintCharAndCodeTo(wc, os); -} - -// Prints the given array of characters to the ostream. CharType must be either -// char or wchar_t. -// The array starts at begin, the length is len, it may include '\0' characters -// and may not be NUL-terminated. -template -static void PrintCharsAsStringTo( - const CharType* begin, size_t len, ostream* os) { - const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; - *os << kQuoteBegin; - bool is_previous_hex = false; - for (size_t index = 0; index < len; ++index) { - const CharType cur = begin[index]; - if (is_previous_hex && IsXDigit(cur)) { - // Previous character is of '\x..' form and this character can be - // interpreted as another hexadecimal digit in its number. Break string to - // disambiguate. 
- *os << "\" " << kQuoteBegin; - } - is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; - } - *os << "\""; -} - -// Prints a (const) char/wchar_t array of 'len' elements, starting at address -// 'begin'. CharType must be either char or wchar_t. -template -static void UniversalPrintCharArray( - const CharType* begin, size_t len, ostream* os) { - // The code - // const char kFoo[] = "foo"; - // generates an array of 4, not 3, elements, with the last one being '\0'. - // - // Therefore when printing a char array, we don't print the last element if - // it's '\0', such that the output matches the string literal as it's - // written in the source code. - if (len > 0 && begin[len - 1] == '\0') { - PrintCharsAsStringTo(begin, len - 1, os); - return; - } - - // If, however, the last element in the array is not '\0', e.g. - // const char kFoo[] = { 'f', 'o', 'o' }; - // we must print the entire array. We also print a message to indicate - // that the array is not NUL-terminated. - PrintCharsAsStringTo(begin, len, os); - *os << " (no terminating NUL)"; -} - -// Prints a (const) char array of 'len' elements, starting at address 'begin'. -void UniversalPrintArray(const char* begin, size_t len, ostream* os) { - UniversalPrintCharArray(begin, len, os); -} - -// Prints a (const) wchar_t array of 'len' elements, starting at address -// 'begin'. -void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { - UniversalPrintCharArray(begin, len, os); -} - -// Prints the given C string to the ostream. -void PrintTo(const char* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, strlen(s), os); - } -} - -// MSVC compiler can be configured to define whar_t as a typedef -// of unsigned short. Defining an overload for const wchar_t* in that case -// would cause pointers to unsigned shorts be printed as wide strings, -// possibly accessing more memory than intended and causing invalid -// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when -// wchar_t is implemented as a native type. -#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Prints the given wide C string to the ostream. -void PrintTo(const wchar_t* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, wcslen(s), os); - } -} -#endif // wchar_t is native - -// Prints a ::string object. -#if GTEST_HAS_GLOBAL_STRING -void PrintStringTo(const ::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -void PrintStringTo(const ::std::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} - -// Prints a ::wstring object. -#if GTEST_HAS_GLOBAL_WSTRING -void PrintWideStringTo(const ::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -void PrintWideStringTo(const ::std::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_STD_WSTRING - -} // namespace internal - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// -// The Google C++ Testing Framework (Google Test) - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -using internal::GetUnitTestImpl; - -// Gets the summary of the failure message by omitting the stack trace -// in it. -std::string TestPartResult::ExtractSummary(const char* message) { - const char* const stack_trace = strstr(message, internal::kStackTraceMarker); - return stack_trace == NULL ? message : - std::string(message, stack_trace); -} - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { - return os - << result.file_name() << ":" << result.line_number() << ": " - << (result.type() == TestPartResult::kSuccess ? "Success" : - result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : - "Non-fatal failure") << ":\n" - << result.message() << std::endl; -} - -// Appends a TestPartResult to the array. -void TestPartResultArray::Append(const TestPartResult& result) { - array_.push_back(result); -} - -// Returns the TestPartResult at the given index (0-based). -const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { - if (index < 0 || index >= size()) { - printf("\nInvalid index (%d) into TestPartResultArray.\n", index); - internal::posix::Abort(); - } - - return array_[index]; -} - -// Returns the number of TestPartResult objects in the array. 
-int TestPartResultArray::size() const { - return static_cast(array_.size()); -} - -namespace internal { - -HasNewFatalFailureHelper::HasNewFatalFailureHelper() - : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); -} - -HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( - original_reporter_); -} - -void HasNewFatalFailureHelper::ReportTestPartResult( - const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; - original_reporter_->ReportTestPartResult(result); -} - -} // namespace internal - -} // namespace testing -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -namespace testing { -namespace internal { - -#if GTEST_HAS_TYPED_TEST_P - -// Skips to the first non-space char in str. Returns an empty string if str -// contains only whitespace characters. -static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; - return str; -} - -// Verifies that registered_tests match the test names in -// defined_test_names_; returns registered_tests if successful, or -// aborts the program otherwise. -const char* TypedTestCasePState::VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests) { - typedef ::std::set::const_iterator DefinedTestIter; - registered_ = true; - - // Skip initial whitespace in registered_tests since some - // preprocessors prefix stringizied literals with whitespace. 
- registered_tests = SkipSpaces(registered_tests); - - Message errors; - ::std::set tests; - for (const char* names = registered_tests; names != NULL; - names = SkipComma(names)) { - const std::string name = GetPrefixUntilComma(names); - if (tests.count(name) != 0) { - errors << "Test " << name << " is listed more than once.\n"; - continue; - } - - bool found = false; - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (name == *it) { - found = true; - break; - } - } - - if (found) { - tests.insert(name); - } else { - errors << "No test named " << name - << " can be found in this test case.\n"; - } - } - - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (tests.count(*it) == 0) { - errors << "You forgot to list test " << *it << ".\n"; - } - } - - const std::string& errors_str = errors.GetString(); - if (errors_str != "") { - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors_str.c_str()); - fflush(stderr); - posix::Abort(); - } - - return registered_tests; -} - -#endif // GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing +#include "src/gtest.cc" +#include "src/gtest-death-test.cc" +#include "src/gtest-filepath.cc" +#include "src/gtest-port.cc" +#include "src/gtest-printers.cc" +#include "src/gtest-test-part.cc" +#include "src/gtest-typed-test.cc" diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc new file mode 100644 index 0000000000..a01a369830 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -0,0 +1,1342 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) +// +// This file implements death tests. 
+
+#include "gtest/gtest-death-test.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+# if !GTEST_OS_WINDOWS
+static bool g_in_fast_death_test_child = false;
+# endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS
+
+  // On Windows, death tests are thread-safe regardless of the value of the
+  // death_test_style flag.
+ return !GTEST_FLAG(internal_run_death_test).empty(); + +# else + + if (GTEST_FLAG(death_test_style) == "threadsafe") + return !GTEST_FLAG(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { +} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +# if GTEST_OS_WINDOWS + + return exit_status == exit_code_; + +# else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +# endif // GTEST_OS_WINDOWS +} + +# if !GTEST_OS_WINDOWS +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) { +} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { +# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + { + bool result; + if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { + return result; + } + } +# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +# endif // !GTEST_OS_WINDOWS + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +# if GTEST_OS_WINDOWS + + m << "Exited with exit status " << exit_code; + +# else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +# ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +# endif +# endif // GTEST_OS_WINDOWS + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +# if !GTEST_OS_WINDOWS +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) + msg << "couldn't detect the number of threads."; + else + msg << "detected " << thread_count << " threads."; + return msg.GetString(); +} +# endif // !GTEST_OS_WINDOWS + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. 
+// TODO(vladl@google.com): Unify names and possibly values for +// AbortReason, DeathTestOutcome, and flag characters above. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +void DeathTestAbort(const std::string& message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. + const InternalRunDeathTestFlag* const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != NULL) { + FILE* parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +# define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort( \ + ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ + + ::testing::internal::StreamableToString(__LINE__) + ": " \ + + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort( \ + ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ + + ::testing::internal::StreamableToString(__LINE__) + ": " \ + + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? "" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. +static void FailFromInternalError(int fd) { + Message error; + char buffer[256]; + int num_read; + + do { + while ((num_read = posix::Read(fd, buffer, 255)) > 0) { + buffer[num_read] = '\0'; + error << buffer; + } + } while (num_read == -1 && errno == EINTR); + + if (num_read == 0) { + GTEST_LOG_(FATAL) << error.GetString(); + } else { + const int last_error = errno; + GTEST_LOG_(FATAL) << "Error while reading death test internal: " + << GetLastErrnoDescription() << " [" << last_error << "]"; + } +} + +// Death test constructor. Increments the running death test count +// for the current test. 
+DeathTest::DeathTest() { + TestInfo* const info = GetUnitTestImpl()->current_test_info(); + if (info == NULL) { + DeathTestAbort("Cannot run a death test outside of a TEST or " + "TEST_F construct"); + } +} + +// Creates and returns a death test by dispatching to the current +// death test factory. +bool DeathTest::Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) { + return GetUnitTestImpl()->death_test_factory()->Create( + statement, regex, file, line, test); +} + +const char* DeathTest::LastMessage() { + return last_death_test_message_.c_str(); +} + +void DeathTest::set_last_death_test_message(const std::string& message) { + last_death_test_message_ = message; +} + +std::string DeathTest::last_death_test_message_; + +// Provides cross platform implementation for some death functionality. +class DeathTestImpl : public DeathTest { + protected: + DeathTestImpl(const char* a_statement, const RE* a_regex) + : statement_(a_statement), + regex_(a_regex), + spawned_(false), + status_(-1), + outcome_(IN_PROGRESS), + read_fd_(-1), + write_fd_(-1) {} + + // read_fd_ is expected to be closed and cleared by a derived class. + ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } + + void Abort(AbortReason reason); + virtual bool Passed(bool status_ok); + + const char* statement() const { return statement_; } + const RE* regex() const { return regex_; } + bool spawned() const { return spawned_; } + void set_spawned(bool is_spawned) { spawned_ = is_spawned; } + int status() const { return status_; } + void set_status(int a_status) { status_ = a_status; } + DeathTestOutcome outcome() const { return outcome_; } + void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } + int read_fd() const { return read_fd_; } + void set_read_fd(int fd) { read_fd_ = fd; } + int write_fd() const { return write_fd_; } + void set_write_fd(int fd) { write_fd_ = fd; } + + // Called in the parent process only. Reads the result code of the death + // test child process via a pipe, interprets it to set the outcome_ + // member, and closes read_fd_. Outputs diagnostics and terminates in + // case of unexpected codes. + void ReadAndInterpretStatusByte(); + + private: + // The textual content of the code this object is testing. This class + // doesn't own this string and should not attempt to delete it. + const char* const statement_; + // The regular expression which test output must match. DeathTestImpl + // doesn't own this object and should not attempt to delete it. + const RE* const regex_; + // True if the death test child process has been successfully spawned. + bool spawned_; + // The exit status of the child process. + int status_; + // How the death test concluded. + DeathTestOutcome outcome_; + // Descriptor to the read end of the pipe to the child process. It is + // always -1 in the child process. The child keeps its write end of the + // pipe in write_fd_. + int read_fd_; + // Descriptor to the child's write end of the pipe to the parent process. + // It is always -1 in the parent process. The parent keeps its end of the + // pipe in read_fd_. + int write_fd_; +}; + +// Called in the parent process only. Reads the result code of the death +// test child process via a pipe, interprets it to set the outcome_ +// member, and closes read_fd_. Outputs diagnostics and terminates in +// case of unexpected codes. 
+void DeathTestImpl::ReadAndInterpretStatusByte() { + char flag; + int bytes_read; + + // The read() here blocks until data is available (signifying the + // failure of the death test) or until the pipe is closed (signifying + // its success), so it's okay to call this in the parent before + // the child process has exited. + do { + bytes_read = posix::Read(read_fd(), &flag, 1); + } while (bytes_read == -1 && errno == EINTR); + + if (bytes_read == 0) { + set_outcome(DIED); + } else if (bytes_read == 1) { + switch (flag) { + case kDeathTestReturned: + set_outcome(RETURNED); + break; + case kDeathTestThrew: + set_outcome(THREW); + break; + case kDeathTestLived: + set_outcome(LIVED); + break; + case kDeathTestInternalError: + FailFromInternalError(read_fd()); // Does not return. + break; + default: + GTEST_LOG_(FATAL) << "Death test child process reported " + << "unexpected status byte (" + << static_cast(flag) << ")"; + } + } else { + GTEST_LOG_(FATAL) << "Read from death test child process failed: " + << GetLastErrnoDescription(); + } + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); + set_read_fd(-1); +} + +// Signals that the death test code which should have exited, didn't. +// Should be called only in a death test child process. +// Writes a status byte to the child's status file descriptor, then +// calls _exit(1). +void DeathTestImpl::Abort(AbortReason reason) { + // The parent process considers the death test to be a failure if + // it finds any data in our pipe. So, here we write a single flag byte + // to the pipe, then exit. + const char status_ch = + reason == TEST_DID_NOT_DIE ? kDeathTestLived : + reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; + + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); + // We are leaking the descriptor here because on some platforms (i.e., + // when built as Windows DLL), destructors of global objects will still + // run after calling _exit(). On such systems, write_fd_ will be + // indirectly closed from the destructor of UnitTestImpl, causing double + // close if it is also closed here. On debug configurations, double close + // may assert. As there are no in-process buffers to flush here, we are + // relying on the OS to close the descriptor after the process terminates + // when the destructors are not run. + _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) +} + +// Returns an indented copy of stderr output for a death test. +// This makes distinguishing death test output lines from regular log lines +// much easier. +static ::std::string FormatDeathTestOutput(const ::std::string& output) { + ::std::string ret; + for (size_t at = 0; ; ) { + const size_t line_end = output.find('\n', at); + ret += "[ DEATH ] "; + if (line_end == ::std::string::npos) { + ret += output.substr(at); + break; + } + ret += output.substr(at, line_end + 1 - at); + at = line_end + 1; + } + return ret; +} + +// Assesses the success or failure of a death test, using both private +// members which have previously been set, and one argument: +// +// Private data members: +// outcome: An enumeration describing how the death test +// concluded: DIED, LIVED, THREW, or RETURNED. The death test +// fails in the latter three cases. +// status: The exit status of the child process. On *nix, it is in the +// in the format specified by wait(2). On Windows, this is the +// value supplied to the ExitProcess() API or a numeric code +// of the exception that terminated the program. 
+// regex: A regular expression object to be applied to +// the test's captured standard error output; the death test +// fails if it does not match. +// +// Argument: +// status_ok: true if exit_status is acceptable in the context of +// this particular death test, which fails if it is false +// +// Returns true iff all of the above conditions are met. Otherwise, the +// first failing condition, in the order given above, is the one that is +// reported. Also sets the last death test message string. +bool DeathTestImpl::Passed(bool status_ok) { + if (!spawned()) + return false; + + const std::string error_message = GetCapturedStderr(); + + bool success = false; + Message buffer; + + buffer << "Death test: " << statement() << "\n"; + switch (outcome()) { + case LIVED: + buffer << " Result: failed to die.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case THREW: + buffer << " Result: threw an exception.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case RETURNED: + buffer << " Result: illegal return in test statement.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case DIED: + if (status_ok) { + const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); + if (matched) { + success = true; + } else { + buffer << " Result: died but not with expected error.\n" + << " Expected: " << regex()->pattern() << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + } else { + buffer << " Result: died but not with expected exit code:\n" + << " " << ExitSummary(status()) << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + break; + case IN_PROGRESS: + default: + GTEST_LOG_(FATAL) + << "DeathTest::Passed somehow called before conclusion of test"; + } + + DeathTest::set_last_death_test_message(buffer.GetString()); + return success; +} + +# if GTEST_OS_WINDOWS +// WindowsDeathTest implements death tests on Windows. Due to the +// specifics of starting new processes on Windows, death tests there are +// always threadsafe, and Google Test considers the +// --gtest_death_test_style=fast setting to be equivalent to +// --gtest_death_test_style=threadsafe there. +// +// A few implementation notes: Like the Linux version, the Windows +// implementation uses pipes for child-to-parent communication. But due to +// the specifics of pipes on Windows, some extra steps are required: +// +// 1. The parent creates a communication pipe and stores handles to both +// ends of it. +// 2. The parent starts the child and provides it with the information +// necessary to acquire the handle to the write end of the pipe. +// 3. The child acquires the write end of the pipe and signals the parent +// using a Windows event. +// 4. Now the parent can release the write end of the pipe on its side. If +// this is done before step 3, the object's reference count goes down to +// 0 and it is destroyed, preventing the child from acquiring it. The +// parent now has to release it, or read operations on the read end of +// the pipe will not return when the child terminates. +// 5. The parent reads child's output through the pipe (outcome code and +// any possible error messages) from the pipe, and its stderr and then +// determines whether to fail the test. +// +// Note: to distinguish Win32 API calls from the local method and function +// calls, the former are explicitly resolved in the global namespace. 
+// +class WindowsDeathTest : public DeathTestImpl { + public: + WindowsDeathTest(const char* a_statement, + const RE* a_regex, + const char* file, + int line) + : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // Handle to the write end of the pipe to the child process. + AutoHandle write_handle_; + // Child process handle. + AutoHandle child_handle_; + // Event the child process uses to signal the parent that it has + // acquired the handle to the write end of the pipe. After seeing this + // event the parent can release its own handles to make sure its + // ReadFile() calls return when the child terminates. + AutoHandle event_handle_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int WindowsDeathTest::Wait() { + if (!spawned()) + return 0; + + // Wait until the child either signals that it has acquired the write end + // of the pipe or it dies. + const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; + switch (::WaitForMultipleObjects(2, + wait_handles, + FALSE, // Waits for any of the handles. + INFINITE)) { + case WAIT_OBJECT_0: + case WAIT_OBJECT_0 + 1: + break; + default: + GTEST_DEATH_TEST_CHECK_(false); // Should not get here. + } + + // The child has acquired the write end of the pipe or exited. + // We release the handle on our side and continue. + write_handle_.Reset(); + event_handle_.Reset(); + + ReadAndInterpretStatusByte(); + + // Waits for the child process to exit if it haven't already. This + // returns immediately if the child has already exited, regardless of + // whether previous calls to WaitForMultipleObjects synchronized on this + // handle or not. + GTEST_DEATH_TEST_CHECK_( + WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), + INFINITE)); + DWORD status_code; + GTEST_DEATH_TEST_CHECK_( + ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); + child_handle_.Reset(); + set_status(static_cast(status_code)); + return status(); +} + +// The AssumeRole process for a Windows death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole WindowsDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + // WindowsDeathTest uses an anonymous pipe to communicate results of + // a death test. + SECURITY_ATTRIBUTES handles_are_inheritable = { + sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; + HANDLE read_handle, write_handle; + GTEST_DEATH_TEST_CHECK_( + ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, + 0) // Default buffer size. 
+ != FALSE); + set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), + O_RDONLY)); + write_handle_.Reset(write_handle); + event_handle_.Reset(::CreateEvent( + &handles_are_inheritable, + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + NULL)); // The even is unnamed. + GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); + const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + + "=" + file_ + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(static_cast(::GetCurrentProcessId())) + + // size_t has the same width as pointers on both 32-bit and 64-bit + // Windows platforms. + // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. + "|" + StreamableToString(reinterpret_cast(write_handle)) + + "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); + + char executable_path[_MAX_PATH + 1]; // NOLINT + GTEST_DEATH_TEST_CHECK_( + _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, + executable_path, + _MAX_PATH)); + + std::string command_line = + std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + + internal_flag + "\""; + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // The child process will share the standard handles with the parent. + STARTUPINFOA startup_info; + memset(&startup_info, 0, sizeof(STARTUPINFO)); + startup_info.dwFlags = STARTF_USESTDHANDLES; + startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); + startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); + startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); + + PROCESS_INFORMATION process_info; + GTEST_DEATH_TEST_CHECK_(::CreateProcessA( + executable_path, + const_cast(command_line.c_str()), + NULL, // Retuned process handle is not inheritable. + NULL, // Retuned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. + NULL, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), + &startup_info, + &process_info) != FALSE); + child_handle_.Reset(process_info.hProcess); + ::CloseHandle(process_info.hThread); + set_spawned(true); + return OVERSEE_TEST; +} +# else // We are not on Windows. + +// ForkingDeathTest provides implementations for most of the abstract +// methods of the DeathTest interface. Only the AssumeRole method is +// left undefined. +class ForkingDeathTest : public DeathTestImpl { + public: + ForkingDeathTest(const char* statement, const RE* regex); + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + + protected: + void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } + + private: + // PID of child process during death test; 0 in the child process itself. + pid_t child_pid_; +}; + +// Constructs a ForkingDeathTest. +ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) + : DeathTestImpl(a_statement, a_regex), + child_pid_(-1) {} + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. 
+int ForkingDeathTest::Wait() { + if (!spawned()) + return 0; + + ReadAndInterpretStatusByte(); + + int status_value; + GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); + set_status(status_value); + return status_value; +} + +// A concrete death test class that forks, then immediately runs the test +// in the child process. +class NoExecDeathTest : public ForkingDeathTest { + public: + NoExecDeathTest(const char* a_statement, const RE* a_regex) : + ForkingDeathTest(a_statement, a_regex) { } + virtual TestRole AssumeRole(); +}; + +// The AssumeRole process for a fork-and-run death test. It implements a +// straightforward fork, with a simple pipe to transmit the status byte. +DeathTest::TestRole NoExecDeathTest::AssumeRole() { + const size_t thread_count = GetThreadCount(); + if (thread_count != 1) { + GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + + DeathTest::set_last_death_test_message(""); + CaptureStderr(); + // When we fork the process below, the log file buffers are copied, but the + // file descriptors are shared. We flush all log files here so that closing + // the file descriptors in the child process doesn't throw off the + // synchronization between descriptors and buffers in the parent process. + // This is as close to the fork as possible to avoid a race condition in case + // there are multiple threads running before the death test, and another + // thread writes to the log file. + FlushInfoLog(); + + const pid_t child_pid = fork(); + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + set_child_pid(child_pid); + if (child_pid == 0) { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); + set_write_fd(pipe_fd[1]); + // Redirects all logging to stderr in the child process to prevent + // concurrent writes to the log files. We capture stderr in the parent + // process and append the child process' output to a log. + LogToStderr(); + // Event forwarding to the listeners of event listener API mush be shut + // down in death test subprocesses. + GetUnitTestImpl()->listeners()->SuppressEventForwarding(); + g_in_fast_death_test_child = true; + return EXECUTE_TEST; + } else { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; + } +} + +// A concrete death test class that forks and re-executes the main +// program from the beginning, with command-line flags set that cause +// only this specific death test to be run. +class ExecDeathTest : public ForkingDeathTest { + public: + ExecDeathTest(const char* a_statement, const RE* a_regex, + const char* file, int line) : + ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } + virtual TestRole AssumeRole(); + private: + static ::std::vector + GetArgvsForDeathTestChildProcess() { + ::std::vector args = GetInjectableArgvs(); +# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + ::std::vector extra_args = + GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); + args.insert(args.end(), extra_args.begin(), extra_args.end()); +# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + return args; + } + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; +}; + +// Utility class for accumulating command-line arguments. 
+class Arguments { + public: + Arguments() { + args_.push_back(NULL); + } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); + ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { + return &args_[0]; + } + + private: + std::vector args_; +}; + +// A struct that encompasses the arguments to the child process of a +// threadsafe-style death test process. +struct ExecDeathTestArgs { + char* const* argv; // Command-line arguments for the child's call to exec + int close_fd; // File descriptor to close; the read end of a pipe +}; + +# if GTEST_OS_MAC +inline char** GetEnviron() { + // When Google Test is built as a framework on MacOS X, the environ variable + // is unavailable. Apple's documentation (man environ) recommends using + // _NSGetEnviron() instead. + return *_NSGetEnviron(); +} +# else +// Some POSIX platforms expect you to declare environ. extern "C" makes +// it reside in the global namespace. +extern "C" char** environ; +inline char** GetEnviron() { return environ; } +# endif // GTEST_OS_MAC + +# if !GTEST_OS_QNX +// The main function for a threadsafe-style death test child process. +// This function is called in a clone()-ed process and thus must avoid +// any potentially unsafe operations like malloc or libc functions. +static int ExecDeathTestChildMain(void* child_arg) { + ExecDeathTestArgs* const args = static_cast(child_arg); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); + + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + // We can safely call execve() as it's a direct system call. We + // cannot use execvp() as it's a libc function and thus potentially + // unsafe. Since execve() doesn't search the PATH, the user must + // invoke the test program via a valid path that contains at least + // one path separator. + execve(args->argv[0], args->argv, GetEnviron()); + DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " + + original_dir + " failed: " + + GetLastErrnoDescription()); + return EXIT_FAILURE; +} +# endif // !GTEST_OS_QNX + +// Two utility routines that together determine the direction the stack +// grows. +// This could be accomplished more elegantly by a single recursive +// function, but we want to guard against the unlikely possibility of +// a smart compiler optimizing the recursion away. +// +// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining +// StackLowerThanAddress into StackGrowsDown, which then doesn't give +// correct answer. +void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; +void StackLowerThanAddress(const void* ptr, bool* result) { + int dummy; + *result = (&dummy < ptr); +} + +// Make sure AddressSanitizer does not tamper with the stack here. 
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +bool StackGrowsDown() { + int dummy; + bool result; + StackLowerThanAddress(&dummy, &result); + return result; +} + +// Spawns a child process with the same executable as the current process in +// a thread-safe manner and instructs it to run the death test. The +// implementation uses fork(2) + exec. On systems where clone(2) is +// available, it is used instead, being slightly more thread-safe. On QNX, +// fork supports only single-threaded environments, so this function uses +// spawn(2) there instead. The function dies with an error message if +// anything goes wrong. +static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { + ExecDeathTestArgs args = { argv, close_fd }; + pid_t child_pid = -1; + +# if GTEST_OS_QNX + // Obtains the current directory and sets it to be closed in the child + // process. + const int cwd_fd = open(".", O_RDONLY); + GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + int fd_flags; + // Set close_fd to be closed after spawn. + GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); + GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, + fd_flags | FD_CLOEXEC)); + struct inheritance inherit = {0}; + // spawn is a system call. + child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron()); + // Restores the current working directory. + GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); + +# else // GTEST_OS_QNX +# if GTEST_OS_LINUX + // When a SIGPROF signal is received while fork() or clone() are executing, + // the process may hang. To avoid this, we ignore SIGPROF here and re-enable + // it after the call to fork()/clone() is complete. + struct sigaction saved_sigprof_action; + struct sigaction ignore_sigprof_action; + memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); + sigemptyset(&ignore_sigprof_action.sa_mask); + ignore_sigprof_action.sa_handler = SIG_IGN; + GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( + SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +# endif // GTEST_OS_LINUX + +# if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG(death_test_use_fork); + + if (!use_fork) { + static const bool stack_grows_down = StackGrowsDown(); + const size_t stack_size = getpagesize(); + // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. + void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); + + // Maximum stack alignment in bytes: For a downward-growing stack, this + // amount is subtracted from size of the stack space to get an address + // that is within the stack space and is aligned on all systems we care + // about. As far as I know there is no ABI with stack alignment greater + // than 64. We assume stack and stack_size already have alignment of + // kMaxStackAlignment. 
+ const size_t kMaxStackAlignment = 64; + void* const stack_top = + static_cast(stack) + + (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment && + reinterpret_cast(stack_top) % kMaxStackAlignment == 0); + + child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); + + GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); + } +# else + const bool use_fork = true; +# endif // GTEST_HAS_CLONE + + if (use_fork && (child_pid = fork()) == 0) { + ExecDeathTestChildMain(&args); + _exit(0); + } +# endif // GTEST_OS_QNX +# if GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &saved_sigprof_action, NULL)); +# endif // GTEST_OS_LINUX + + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + return child_pid; +} + +// The AssumeRole process for a fork-and-exec death test. It re-executes the +// main program from the beginning, setting the --gtest_filter +// and --gtest_internal_run_death_test flags to cause only the current +// death test to be re-run. +DeathTest::TestRole ExecDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + // Clear the close-on-exec flag on the write end of the pipe, lest + // it be closed when the child process does an exec: + GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); + + const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + + file_ + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); + Arguments args; + args.AddArguments(GetArgvsForDeathTestChildProcess()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // See the comment in NoExecDeathTest::AssumeRole for why the next line + // is necessary. + FlushInfoLog(); + + const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_child_pid(child_pid); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; +} + +# endif // !GTEST_OS_WINDOWS + +// Creates a concrete DeathTest-derived class that depends on the +// --gtest_death_test_style flag, and sets the pointer pointed to +// by the "test" argument to its address. If the test should be +// skipped, sets that pointer to NULL. Returns true, unless the +// flag is set to an invalid value. 
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, + const char* file, int line, + DeathTest** test) { + UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const int death_test_index = impl->current_test_info() + ->increment_death_test_count(); + + if (flag != NULL) { + if (death_test_index > flag->index()) { + DeathTest::set_last_death_test_message( + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); + return false; + } + + if (!(flag->file() == file && flag->line() == line && + flag->index() == death_test_index)) { + *test = NULL; + return true; + } + } + +# if GTEST_OS_WINDOWS + + if (GTEST_FLAG(death_test_style) == "threadsafe" || + GTEST_FLAG(death_test_style) == "fast") { + *test = new WindowsDeathTest(statement, regex, file, line); + } + +# else + + if (GTEST_FLAG(death_test_style) == "threadsafe") { + *test = new ExecDeathTest(statement, regex, file, line); + } else if (GTEST_FLAG(death_test_style) == "fast") { + *test = new NoExecDeathTest(statement, regex); + } + +# endif // GTEST_OS_WINDOWS + + else { // NOLINT - this is more readable than unbalanced brackets inside #if. + DeathTest::set_last_death_test_message( + "Unknown death test style \"" + GTEST_FLAG(death_test_style) + + "\" encountered"); + return false; + } + + return true; +} + +# if GTEST_OS_WINDOWS +// Recreates the pipe and event handles from the provided parameters, +// signals the event, and returns a file descriptor wrapped around the pipe +// handle. This function is called in the child process only. +int GetStatusFileDescriptor(unsigned int parent_process_id, + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { + AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, + FALSE, // Non-inheritable. + parent_process_id)); + if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { + DeathTestAbort("Unable to open parent process " + + StreamableToString(parent_process_id)); + } + + // TODO(vladl@google.com): Replace the following check with a + // compile-time assertion when available. + GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); + + const HANDLE write_handle = + reinterpret_cast(write_handle_as_size_t); + HANDLE dup_write_handle; + + // The newly initialized handle is accessible only in in the parent + // process. To obtain one accessible within the child, we need to use + // DuplicateHandle. + if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, + ::GetCurrentProcess(), &dup_write_handle, + 0x0, // Requested privileges ignored since + // DUPLICATE_SAME_ACCESS is used. + FALSE, // Request non-inheritable handler. 
+ DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the pipe handle " + + StreamableToString(write_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); + HANDLE dup_event_handle; + + if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, + ::GetCurrentProcess(), &dup_event_handle, + 0x0, + FALSE, + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the event handle " + + StreamableToString(event_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const int write_fd = + ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); + if (write_fd == -1) { + DeathTestAbort("Unable to convert pipe handle " + + StreamableToString(write_handle_as_size_t) + + " to a file descriptor"); + } + + // Signals the parent that the write end of the pipe has been acquired + // so the parent can release its own write end. + ::SetEvent(dup_event_handle); + + return write_fd; +} +# endif // GTEST_OS_WINDOWS + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { + if (GTEST_FLAG(internal_run_death_test) == "") return NULL; + + // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we + // can use it here. + int line = -1; + int index = -1; + ::std::vector< ::std::string> fields; + SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); + int write_fd = -1; + +# if GTEST_OS_WINDOWS + + unsigned int parent_process_id = 0; + size_t write_handle_as_size_t = 0; + size_t event_handle_as_size_t = 0; + + if (fields.size() != 6 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index) + || !ParseNaturalNumber(fields[3], &parent_process_id) + || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) + || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG(internal_run_death_test)); + } + write_fd = GetStatusFileDescriptor(parent_process_id, + write_handle_as_size_t, + event_handle_as_size_t); +# else + + if (fields.size() != 4 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index) + || !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG(internal_run_death_test)); + } + +# endif // GTEST_OS_WINDOWS + + return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); +} + +} // namespace internal + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc new file mode 100644 index 0000000000..0292dc1195 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -0,0 +1,387 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: keith.ray@gmail.com (Keith Ray) + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-port.h" + +#include + +#if GTEST_OS_WINDOWS_MOBILE +# include +#elif GTEST_OS_WINDOWS +# include +# include +#elif GTEST_OS_SYMBIAN +// Symbian OpenC has PATH_MAX in sys/syslimits.h +# include +#else +# include +# include // Some Linux distributions define PATH_MAX here. +#endif // GTEST_OS_WINDOWS_MOBILE + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_MAX_ _MAX_PATH +#elif defined(PATH_MAX) +# define GTEST_PATH_MAX_ PATH_MAX +#elif defined(_XOPEN_PATH_MAX) +# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#else +# define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#endif // GTEST_OS_WINDOWS + +#include "gtest/internal/gtest-string.h" + +namespace testing { +namespace internal { + +#if GTEST_OS_WINDOWS +// On Windows, '\\' is the standard path separator, but many tools and the +// Windows API also accept '/' as an alternate path separator. Unless otherwise +// noted, a file path can contain either kind of path separators, or a mixture +// of them. +const char kPathSeparator = '\\'; +const char kAlternatePathSeparator = '/'; +const char kAlternatePathSeparatorString[] = "/"; +# if GTEST_OS_WINDOWS_MOBILE +// Windows CE doesn't have a current directory. You should not use +// the current directory in tests on Windows CE, but this at least +// provides a reasonable fallback. +const char kCurrentDirectoryString[] = "\\"; +// Windows CE doesn't define INVALID_FILE_ATTRIBUTES +const DWORD kInvalidFileAttributes = 0xffffffff; +# else +const char kCurrentDirectoryString[] = ".\\"; +# endif // GTEST_OS_WINDOWS_MOBILE +#else +const char kPathSeparator = '/'; +const char kCurrentDirectoryString[] = "./"; +#endif // GTEST_OS_WINDOWS + +// Returns whether the given character is a valid path separator. +static bool IsPathSeparator(char c) { +#if GTEST_HAS_ALT_PATH_SEP_ + return (c == kPathSeparator) || (c == kAlternatePathSeparator); +#else + return c == kPathSeparator; +#endif +} + +// Returns the current working directory, or "" if unsuccessful. +FilePath FilePath::GetCurrentDir() { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT + // Windows CE doesn't have a current directory, so we just return + // something reasonable. 
+ return FilePath(kCurrentDirectoryString); +#elif GTEST_OS_WINDOWS + char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); +#else + char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char* result = getcwd(cwd, sizeof(cwd)); +# if GTEST_OS_NACL + // getcwd will likely fail in NaCl due to the sandbox, so return something + // reasonable. The user may have provided a shim implementation for getcwd, + // however, so fallback only when failure is detected. + return FilePath(result == NULL ? kCurrentDirectoryString : cwd); +# endif // GTEST_OS_NACL + return FilePath(result == NULL ? "" : cwd); +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns a copy of the FilePath with the case-insensitive extension removed. +// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns +// FilePath("dir/file"). If a case-insensitive extension is not +// found, returns a copy of the original FilePath. +FilePath FilePath::RemoveExtension(const char* extension) const { + const std::string dot_extension = std::string(".") + extension; + if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { + return FilePath(pathname_.substr( + 0, pathname_.length() - dot_extension.length())); + } + return *this; +} + +// Returns a pointer to the last occurence of a valid path separator in +// the FilePath. On Windows, for example, both '/' and '\' are valid path +// separators. Returns NULL if no path separator was found. +const char* FilePath::FindLastPathSeparator() const { + const char* const last_sep = strrchr(c_str(), kPathSeparator); +#if GTEST_HAS_ALT_PATH_SEP_ + const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); + // Comparing two pointers of which only one is NULL is undefined. + if (last_alt_sep != NULL && + (last_sep == NULL || last_alt_sep > last_sep)) { + return last_alt_sep; + } +#endif + return last_sep; +} + +// Returns a copy of the FilePath with the directory part removed. +// Example: FilePath("path/to/file").RemoveDirectoryName() returns +// FilePath("file"). If there is no directory part ("just_a_file"), it returns +// the FilePath unmodified. If there is no file part ("just_a_dir/") it +// returns an empty FilePath (""). +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveDirectoryName() const { + const char* const last_sep = FindLastPathSeparator(); + return last_sep ? FilePath(last_sep + 1) : *this; +} + +// RemoveFileName returns the directory path with the filename removed. +// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". +// If the FilePath is "a_file" or "/a_file", RemoveFileName returns +// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does +// not have a file, like "just/a/dir/", it returns the FilePath unmodified. +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveFileName() const { + const char* const last_sep = FindLastPathSeparator(); + std::string dir; + if (last_sep) { + dir = std::string(c_str(), last_sep + 1 - c_str()); + } else { + dir = kCurrentDirectoryString; + } + return FilePath(dir); +} + +// Helper functions for naming files in a directory for xml output. + +// Given directory = "dir", base_name = "test", number = 0, +// extension = "xml", returns "dir/test.xml". If number is greater +// than zero (e.g., 12), returns "dir/test_12.xml". +// On Windows platform, uses \ as the separator rather than /. 
+FilePath FilePath::MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension) { + std::string file; + if (number == 0) { + file = base_name.string() + "." + extension; + } else { + file = base_name.string() + "_" + StreamableToString(number) + + "." + extension; + } + return ConcatPaths(directory, FilePath(file)); +} + +// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". +// On Windows, uses \ as the separator rather than /. +FilePath FilePath::ConcatPaths(const FilePath& directory, + const FilePath& relative_path) { + if (directory.IsEmpty()) + return relative_path; + const FilePath dir(directory.RemoveTrailingPathSeparator()); + return FilePath(dir.string() + kPathSeparator + relative_path.string()); +} + +// Returns true if pathname describes something findable in the file-system, +// either a file, directory, or whatever. +bool FilePath::FileOrDirectoryExists() const { +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete [] unicode; + return attributes != kInvalidFileAttributes; +#else + posix::StatStruct file_stat; + return posix::Stat(pathname_.c_str(), &file_stat) == 0; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns true if pathname describes a directory in the file-system +// that exists. +bool FilePath::DirectoryExists() const { + bool result = false; +#if GTEST_OS_WINDOWS + // Don't strip off trailing separator if path is a root directory on + // Windows (like "C:\\"). + const FilePath& path(IsRootDirectory() ? *this : + RemoveTrailingPathSeparator()); +#else + const FilePath& path(*this); +#endif + +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete [] unicode; + if ((attributes != kInvalidFileAttributes) && + (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = true; + } +#else + posix::StatStruct file_stat; + result = posix::Stat(path.c_str(), &file_stat) == 0 && + posix::IsDir(file_stat); +#endif // GTEST_OS_WINDOWS_MOBILE + + return result; +} + +// Returns true if pathname describes a root directory. (Windows has one +// root directory per disk drive.) +bool FilePath::IsRootDirectory() const { +#if GTEST_OS_WINDOWS + // TODO(wan@google.com): on Windows a network share like + // \\server\share can be a root directory, although it cannot be the + // current directory. Handle this properly. + return pathname_.length() == 3 && IsAbsolutePath(); +#else + return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); +#endif +} + +// Returns true if pathname describes an absolute path. +bool FilePath::IsAbsolutePath() const { + const char* const name = pathname_.c_str(); +#if GTEST_OS_WINDOWS + return pathname_.length() >= 3 && + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && + IsPathSeparator(name[2]); +#else + return IsPathSeparator(name[0]); +#endif +} + +// Returns a pathname for a file that does not currently exist. The pathname +// will be directory/base_name.extension or +// directory/base_name_.extension if directory/base_name.extension +// already exists. The number will be incremented until a pathname is found +// that does not already exist. +// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. 
+// There could be a race condition if two or more processes are calling this +// function at the same time -- they could both pick the same filename. +FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension) { + FilePath full_pathname; + int number = 0; + do { + full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); + } while (full_pathname.FileOrDirectoryExists()); + return full_pathname; +} + +// Returns true if FilePath ends with a path separator, which indicates that +// it is intended to represent a directory. Returns false otherwise. +// This does NOT check that a directory (or file) actually exists. +bool FilePath::IsDirectory() const { + return !pathname_.empty() && + IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); +} + +// Create directories so that path exists. Returns true if successful or if +// the directories already exist; returns false if unable to create directories +// for any reason. +bool FilePath::CreateDirectoriesRecursively() const { + if (!this->IsDirectory()) { + return false; + } + + if (pathname_.length() == 0 || this->DirectoryExists()) { + return true; + } + + const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); + return parent.CreateDirectoriesRecursively() && this->CreateFolder(); +} + +// Create the directory so that path exists. Returns true if successful or +// if the directory already exists; returns false if unable to create the +// directory for any reason, including if the parent directory does not +// exist. Not named "CreateDirectory" because that's a macro on Windows. +bool FilePath::CreateFolder() const { +#if GTEST_OS_WINDOWS_MOBILE + FilePath removed_sep(this->RemoveTrailingPathSeparator()); + LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); + int result = CreateDirectory(unicode, NULL) ? 0 : -1; + delete [] unicode; +#elif GTEST_OS_WINDOWS + int result = _mkdir(pathname_.c_str()); +#else + int result = mkdir(pathname_.c_str(), 0777); +#endif // GTEST_OS_WINDOWS_MOBILE + + if (result == -1) { + return this->DirectoryExists(); // An error is OK if the directory exists. + } + return true; // No error. +} + +// If input name has a trailing separator character, remove it and return the +// name, otherwise return the name string unmodified. +// On Windows platform, uses \ as the separator, other platforms use /. +FilePath FilePath::RemoveTrailingPathSeparator() const { + return IsDirectory() + ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; +} + +// Removes any redundant separators that might be in the pathname. +// For example, "bar///foo" becomes "bar/foo". Does not eliminate other +// redundancies that might be in a pathname involving "." or "..". +// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). 
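FilePath::Normalize(), defined next, removes redundant separators in the way described above ("bar///foo" becomes "bar/foo") while leaving "." and ".." components alone. A tiny standalone illustration of that collapsing on a plain std::string (assuming '/' as the only separator; not the library's code):

#include <cassert>
#include <string>

// Collapses runs of '/' into a single separator; every other character,
// including "." and ".." components, passes through untouched.
static std::string CollapseSeparators(const std::string& path) {
  std::string out;
  for (std::string::size_type i = 0; i < path.size(); ++i) {
    if (path[i] == '/' && !out.empty() && out[out.size() - 1] == '/') {
      continue;  // Skip the redundant separator.
    }
    out += path[i];
  }
  return out;
}

int main() {
  assert(CollapseSeparators("bar///foo") == "bar/foo");
  assert(CollapseSeparators("a//b/./c") == "a/b/./c");
  return 0;
}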
+void FilePath::Normalize() { + if (pathname_.c_str() == NULL) { + pathname_ = ""; + return; + } + const char* src = pathname_.c_str(); + char* const dest = new char[pathname_.length() + 1]; + char* dest_ptr = dest; + memset(dest_ptr, 0, pathname_.length() + 1); + + while (*src != '\0') { + *dest_ptr = *src; + if (!IsPathSeparator(*src)) { + src++; + } else { +#if GTEST_HAS_ALT_PATH_SEP_ + if (*dest_ptr == kAlternatePathSeparator) { + *dest_ptr = kPathSeparator; + } +#endif + while (IsPathSeparator(*src)) + src++; + } + dest_ptr++; + } + *dest_ptr = '\0'; + pathname_ = dest; + delete[] dest; +} + +} // namespace internal +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h new file mode 100644 index 0000000000..ed8a682a96 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -0,0 +1,1183 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utility functions and classes used by the Google C++ testing framework. +// +// Author: wan@google.com (Zhanyong Wan) +// +// This file contains purely Google Test's internal implementation. Please +// DO NOT #INCLUDE IT IN A USER PROGRAM. + +#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ +#define GTEST_SRC_GTEST_INTERNAL_INL_H_ + +// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is +// part of Google Test's implementation; otherwise it's undefined. +#if !GTEST_IMPLEMENTATION_ +// If this file is included from the user's code, just say no. +# error "gtest-internal-inl.h is part of Google Test's internal implementation." +# error "It must not be included except by Google Test itself." +#endif // GTEST_IMPLEMENTATION_ + +#ifndef _WIN32_WCE +# include +#endif // !_WIN32_WCE +#include +#include // For strtoll/_strtoul64/malloc/free. +#include // For memmove. 
+ +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +#if GTEST_CAN_STREAM_RESULTS_ +# include // NOLINT +# include // NOLINT +#endif + +#if GTEST_OS_WINDOWS +# include // NOLINT +#endif // GTEST_OS_WINDOWS + +#include "gtest/gtest.h" // NOLINT +#include "gtest/gtest-spi.h" + +namespace testing { + +// Declares the flags. +// +// We don't want the users to modify this flag in the code, but want +// Google Test's own unit tests to be able to access it. Therefore we +// declare it here as opposed to in gtest.h. +GTEST_DECLARE_bool_(death_test_use_fork); + +namespace internal { + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; + +// Names of the flags (needed for parsing Google Test flags). +const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; +const char kBreakOnFailureFlag[] = "break_on_failure"; +const char kCatchExceptionsFlag[] = "catch_exceptions"; +const char kColorFlag[] = "color"; +const char kFilterFlag[] = "filter"; +const char kListTestsFlag[] = "list_tests"; +const char kOutputFlag[] = "output"; +const char kPrintTimeFlag[] = "print_time"; +const char kRandomSeedFlag[] = "random_seed"; +const char kRepeatFlag[] = "repeat"; +const char kShuffleFlag[] = "shuffle"; +const char kStackTraceDepthFlag[] = "stack_trace_depth"; +const char kStreamResultToFlag[] = "stream_result_to"; +const char kThrowOnFailureFlag[] = "throw_on_failure"; +const char kFlagfileFlag[] = "flagfile"; + +// A valid random seed must be in [1, kMaxRandomSeed]. +const int kMaxRandomSeed = 99999; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +GTEST_API_ extern bool g_help_flag; + +// Returns the current time in milliseconds. +GTEST_API_ TimeInMillis GetTimeInMillis(); + +// Returns true iff Google Test should use colors in the output. +GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); + +// Formats the given time in milliseconds as seconds. +GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); + +// Converts the given time in milliseconds to a date string in the ISO 8601 +// format, without the timezone information. N.B.: due to the use the +// non-reentrant localtime() function, this function is not thread safe. Do +// not use it in any code that can be called from multiple threads. +GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); + +// Parses a string for an Int32 flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +GTEST_API_ bool ParseInt32Flag( + const char* str, const char* flag, Int32* value); + +// Returns a random seed in range [1, kMaxRandomSeed] based on the +// given --gtest_random_seed flag value. +inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { + const unsigned int raw_seed = (random_seed_flag == 0) ? + static_cast(GetTimeInMillis()) : + static_cast(random_seed_flag); + + // Normalizes the actual seed to range [1, kMaxRandomSeed] such that + // it's easy to type. + const int normalized_seed = + static_cast((raw_seed - 1U) % + static_cast(kMaxRandomSeed)) + 1; + return normalized_seed; +} + +// Returns the first valid random seed after 'seed'. The behavior is +// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is +// considered to be 1. 
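GetRandomSeedFromFlag() above folds an arbitrary --gtest_random_seed value into the valid range [1, kMaxRandomSeed], and GetNextRandomSeed(), defined next, wraps around the same bound. A small standalone sketch (not part of the patch) of that normalization arithmetic for a few sample inputs:

#include <cstdio>

int main() {
  const int kMaxRandomSeed = 99999;
  const unsigned int samples[] = { 1u, 2u, 99999u, 100000u, 100001u };
  for (int i = 0; i < 5; ++i) {
    const unsigned int raw_seed = samples[i];
    // Same expression as GetRandomSeedFromFlag(): shift into [0, 99998],
    // wrap with %, then shift back into [1, 99999].
    const int normalized_seed =
        static_cast<int>((raw_seed - 1U) %
                         static_cast<unsigned int>(kMaxRandomSeed)) + 1;
    std::printf("%u -> %d\n", raw_seed, normalized_seed);
    // Prints: 1 -> 1, 2 -> 2, 99999 -> 99999, 100000 -> 1, 100001 -> 2
  }
  return 0;
}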
+inline int GetNextRandomSeed(int seed) { + GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) + << "Invalid random seed " << seed << " - must be in [1, " + << kMaxRandomSeed << "]."; + const int next_seed = seed + 1; + return (next_seed > kMaxRandomSeed) ? 1 : next_seed; +} + +// This class saves the values of all Google Test flags in its c'tor, and +// restores them in its d'tor. +class GTestFlagSaver { + public: + // The c'tor. + GTestFlagSaver() { + also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG(break_on_failure); + catch_exceptions_ = GTEST_FLAG(catch_exceptions); + color_ = GTEST_FLAG(color); + death_test_style_ = GTEST_FLAG(death_test_style); + death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); + filter_ = GTEST_FLAG(filter); + internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); + list_tests_ = GTEST_FLAG(list_tests); + output_ = GTEST_FLAG(output); + print_time_ = GTEST_FLAG(print_time); + random_seed_ = GTEST_FLAG(random_seed); + repeat_ = GTEST_FLAG(repeat); + shuffle_ = GTEST_FLAG(shuffle); + stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); + stream_result_to_ = GTEST_FLAG(stream_result_to); + throw_on_failure_ = GTEST_FLAG(throw_on_failure); + } + + // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. + ~GTestFlagSaver() { + GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; + GTEST_FLAG(break_on_failure) = break_on_failure_; + GTEST_FLAG(catch_exceptions) = catch_exceptions_; + GTEST_FLAG(color) = color_; + GTEST_FLAG(death_test_style) = death_test_style_; + GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; + GTEST_FLAG(filter) = filter_; + GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; + GTEST_FLAG(list_tests) = list_tests_; + GTEST_FLAG(output) = output_; + GTEST_FLAG(print_time) = print_time_; + GTEST_FLAG(random_seed) = random_seed_; + GTEST_FLAG(repeat) = repeat_; + GTEST_FLAG(shuffle) = shuffle_; + GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; + GTEST_FLAG(stream_result_to) = stream_result_to_; + GTEST_FLAG(throw_on_failure) = throw_on_failure_; + } + + private: + // Fields for saving the original values of flags. + bool also_run_disabled_tests_; + bool break_on_failure_; + bool catch_exceptions_; + std::string color_; + std::string death_test_style_; + bool death_test_use_fork_; + std::string filter_; + std::string internal_run_death_test_; + bool list_tests_; + std::string output_; + bool print_time_; + internal::Int32 random_seed_; + internal::Int32 repeat_; + bool shuffle_; + internal::Int32 stack_trace_depth_; + std::string stream_result_to_; + bool throw_on_failure_; +} GTEST_ATTRIBUTE_UNUSED_; + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. 
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
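ShuffleRange(), defined next, performs this with a Fisher–Yates pass driven by Google Test's internal::Random. A rough standalone equivalent over a std::vector<int> using <random> (illustrative only, not the library's implementation):

#include <cstdio>
#include <random>
#include <utility>
#include <vector>

// Shuffles v[begin, end) in place: walk the range back to front and swap
// each slot with a uniformly chosen slot at or before it.
static void ShuffleIndexRange(std::mt19937* rng, int begin, int end,
                              std::vector<int>* v) {
  for (int range_width = end - begin; range_width >= 2; --range_width) {
    const int last_in_range = begin + range_width - 1;
    std::uniform_int_distribution<int> pick(0, range_width - 1);
    const int selected = begin + pick(*rng);
    std::swap((*v)[selected], (*v)[last_in_range]);
  }
}

int main() {
  std::vector<int> v;
  for (int i = 0; i < 10; ++i) v.push_back(i);
  std::mt19937 rng(12345);
  ShuffleIndexRange(&rng, 2, 8, &v);  // Elements 0, 1, 8, 9 keep their places.
  for (int i = 0; i < 10; ++i) std::printf("%d ", v[i]);
  std::printf("\n");
  return 0;
}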
+template +void ShuffleRange(internal::Random* random, int begin, int end, + std::vector* v) { + const int size = static_cast(v->size()); + GTEST_CHECK_(0 <= begin && begin <= size) + << "Invalid shuffle range start " << begin << ": must be in range [0, " + << size << "]."; + GTEST_CHECK_(begin <= end && end <= size) + << "Invalid shuffle range finish " << end << ": must be in range [" + << begin << ", " << size << "]."; + + // Fisher-Yates shuffle, from + // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle + for (int range_width = end - begin; range_width >= 2; range_width--) { + const int last_in_range = begin + range_width - 1; + const int selected = begin + random->Generate(range_width); + std::swap((*v)[selected], (*v)[last_in_range]); + } +} + +// Performs an in-place shuffle of the vector's elements. +template +inline void Shuffle(internal::Random* random, std::vector* v) { + ShuffleRange(random, 0, static_cast(v->size()), v); +} + +// A function for deleting an object. Handy for being used as a +// functor. +template +static void Delete(T* x) { + delete x; +} + +// A predicate that checks the key of a TestProperty against a known key. +// +// TestPropertyKeyIs is copyable. +class TestPropertyKeyIs { + public: + // Constructor. + // + // TestPropertyKeyIs has NO default constructor. + explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} + + // Returns true iff the test name of test property matches on key_. + bool operator()(const TestProperty& test_property) const { + return test_property.key() == key_; + } + + private: + std::string key_; +}; + +// Class UnitTestOptions. +// +// This class contains functions for processing options the user +// specifies when running the tests. It has only static members. +// +// In most cases, the user can specify an option using either an +// environment variable or a command line flag. E.g. you can set the +// test filter using either GTEST_FILTER or --gtest_filter. If both +// the variable and the flag are present, the latter overrides the +// former. +class GTEST_API_ UnitTestOptions { + public: + // Functions for processing the gtest_output flag. + + // Returns the output format, or "" for normal printed output. + static std::string GetOutputFormat(); + + // Returns the absolute path of the requested output file, or the + // default (test_detail.xml in the original working directory) if + // none was explicitly specified. + static std::string GetAbsolutePathToOutputFile(); + + // Functions for processing the gtest_filter flag. + + // Returns true iff the wildcard pattern matches the string. The + // first ':' or '\0' character in pattern marks the end of it. + // + // This recursive algorithm isn't very efficient, but is clear and + // works well enough for matching test names, which are short. + static bool PatternMatchesString(const char *pattern, const char *str); + + // Returns true iff the user-specified filter matches the test case + // name and the test name. + static bool FilterMatchesTest(const std::string &test_case_name, + const std::string &test_name); + +#if GTEST_OS_WINDOWS + // Function for supporting the gtest_catch_exception flag. + + // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the + // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. + // This function is useful as an __except condition. + static int GTestShouldProcessSEH(DWORD exception_code); +#endif // GTEST_OS_WINDOWS + + // Returns true if "name" matches the ':' separated list of glob-style + // filters in "filter". 
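MatchesFilter(), declared next, checks a test's full name against the ':'-separated glob patterns that PatternMatchesString() evaluates one at a time: '*' matches any substring, '?' matches any single character, and the first ':' (or the end of the string) terminates each pattern. A rough standalone sketch of that matching (not Google Test's code):

#include <cassert>
#include <cstring>
#include <string>

// Returns true if the glob pattern (terminated by ':' or '\0') matches str.
static bool PatternMatches(const char* pattern, const char* str) {
  switch (*pattern) {
    case '\0':
    case ':':
      return *str == '\0';
    case '?':  // Matches any single character.
      return *str != '\0' && PatternMatches(pattern + 1, str + 1);
    case '*':  // Matches any substring, possibly empty.
      return (*str != '\0' && PatternMatches(pattern, str + 1)) ||
             PatternMatches(pattern + 1, str);
    default:   // Ordinary characters must match exactly.
      return *pattern == *str && PatternMatches(pattern + 1, str + 1);
  }
}

// Returns true if name matches any of the ':'-separated patterns in filter.
static bool MatchesAnyPattern(const std::string& name, const char* filter) {
  for (const char* cur = filter; cur != NULL;) {
    if (PatternMatches(cur, name.c_str())) return true;
    cur = strchr(cur, ':');
    if (cur != NULL) ++cur;  // Step past the ':' and try the next pattern.
  }
  return false;
}

int main() {
  assert(MatchesAnyPattern("FooTest.Bar", "FooTest.*"));
  assert(MatchesAnyPattern("BazTest.Qux", "FooTest.*:BazTest.Qux"));
  assert(!MatchesAnyPattern("OtherTest.Case", "FooTest.*:BazTest.Qux"));
  return 0;
}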
+ static bool MatchesFilter(const std::string& name, const char* filter); +}; + +// Returns the current application's name, removing directory path if that +// is present. Used by UnitTestOptions::GetOutputFile. +GTEST_API_ FilePath GetCurrentExecutableName(); + +// The role interface for getting the OS stack trace as a string. +class OsStackTraceGetterInterface { + public: + OsStackTraceGetterInterface() {} + virtual ~OsStackTraceGetterInterface() {} + + // Returns the current OS stack trace as an std::string. Parameters: + // + // max_depth - the maximum number of stack frames to be included + // in the trace. + // skip_count - the number of top frames to be skipped; doesn't count + // against max_depth. + virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; + + // UponLeavingGTest() should be called immediately before Google Test calls + // user code. It saves some information about the current stack that + // CurrentStackTrace() will use to find and hide Google Test stack frames. + virtual void UponLeavingGTest() = 0; + + // This string is inserted in place of stack frames that are part of + // Google Test's implementation. + static const char* const kElidedFramesMarker; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); +}; + +// A working implementation of the OsStackTraceGetterInterface interface. +class OsStackTraceGetter : public OsStackTraceGetterInterface { + public: + OsStackTraceGetter() {} + + virtual string CurrentStackTrace(int max_depth, int skip_count); + virtual void UponLeavingGTest(); + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); +}; + +// Information about a Google Test trace point. +struct TraceInfo { + const char* file; + int line; + std::string message; +}; + +// This is the default global test part result reporter used in UnitTestImpl. +// This class should only be used by UnitTestImpl. +class DefaultGlobalTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. Reports the test part + // result in the current test. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); +}; + +// This is the default per thread test part result reporter used in +// UnitTestImpl. This class should only be used by UnitTestImpl. +class DefaultPerThreadTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. The implementation just + // delegates to the current global test part result reporter of *unit_test_. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); +}; + +// The private implementation of the UnitTest class. We don't protect +// the methods under a mutex, as this class is not accessible by a +// user and the UnitTest class that delegates work to this class does +// proper locking. +class GTEST_API_ UnitTestImpl { + public: + explicit UnitTestImpl(UnitTest* parent); + virtual ~UnitTestImpl(); + + // There are two different ways to register your own TestPartResultReporter. 
+ // You can register your own repoter to listen either only for test results + // from the current thread or for results from all threads. + // By default, each per-thread test result repoter just passes a new + // TestPartResult to the global test result reporter, which registers the + // test part result for the currently running test. + + // Returns the global test part result reporter. + TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); + + // Sets the global test part result reporter. + void SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter); + + // Returns the test part result reporter for the current thread. + TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); + + // Sets the test part result reporter for the current thread. + void SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter); + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const { return !Failed(); } + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const { + return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[i]; + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i) { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[index]; + } + + // Provides access to the event listener list. + TestEventListeners* listeners() { return &listeners_; } + + // Returns the TestResult for the test that's currently running, or + // the TestResult for the ad hoc test if no test is running. + TestResult* current_test_result(); + + // Returns the TestResult for the ad hoc test. 
+ const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } + + // Sets the OS stack trace getter. + // + // Does nothing if the input and the current OS stack trace getter + // are the same; otherwise, deletes the old getter and makes the + // input the current getter. + void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); + + // Returns the current OS stack trace getter if it is not NULL; + // otherwise, creates an OsStackTraceGetter, makes it the current + // getter, and returns it. + OsStackTraceGetterInterface* os_stack_trace_getter(); + + // Returns the current OS stack trace as an std::string. + // + // The maximum number of stack frames to be included is specified by + // the gtest_stack_trace_depth flag. The skip_count parameter + // specifies the number of top frames to be skipped, which doesn't + // count against the number of frames to be included. + // + // For example, if Foo() calls Bar(), which in turn calls + // CurrentOsStackTraceExceptTop(1), Foo() will be included in the + // trace but Bar() and CurrentOsStackTraceExceptTop() won't. + std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; + + // Finds and returns a TestCase with the given name. If one doesn't + // exist, creates one and returns it. + // + // Arguments: + // + // test_case_name: name of the test case + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + TestCase* GetTestCase(const char* test_case_name, + const char* type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc); + + // Adds a TestInfo to the unit test. + // + // Arguments: + // + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + // test_info: the TestInfo object + void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc, + TestInfo* test_info) { + // In order to support thread-safe death tests, we need to + // remember the original working directory when the test program + // was first invoked. We cannot do this in RUN_ALL_TESTS(), as + // the user may have changed the current directory before calling + // RUN_ALL_TESTS(). Therefore we capture the current directory in + // AddTestInfo(), which is called to register a TEST or TEST_F + // before main() is reached. + if (original_working_dir_.IsEmpty()) { + original_working_dir_.Set(FilePath::GetCurrentDir()); + GTEST_CHECK_(!original_working_dir_.IsEmpty()) + << "Failed to get the current working directory."; + } + + GetTestCase(test_info->test_case_name(), + test_info->type_param(), + set_up_tc, + tear_down_tc)->AddTestInfo(test_info); + } + +#if GTEST_HAS_PARAM_TEST + // Returns ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { + return parameterized_test_registry_; + } +#endif // GTEST_HAS_PARAM_TEST + + // Sets the TestCase object for the test that's currently running. + void set_current_test_case(TestCase* a_current_test_case) { + current_test_case_ = a_current_test_case; + } + + // Sets the TestInfo object for the test that's currently running. 
If + // current_test_info is NULL, the assertion results will be stored in + // ad_hoc_test_result_. + void set_current_test_info(TestInfo* a_current_test_info) { + current_test_info_ = a_current_test_info; + } + + // Registers all parameterized tests defined using TEST_P and + // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter + // combination. This method can be called more then once; it has guards + // protecting from registering the tests more then once. If + // value-parameterized tests are disabled, RegisterParameterizedTests is + // present but does nothing. + void RegisterParameterizedTests(); + + // Runs all tests in this UnitTest object, prints the result, and + // returns true if all tests are successful. If any exception is + // thrown during a test, this test is considered to be failed, but + // the rest of the tests will still be run. + bool RunAllTests(); + + // Clears the results of all tests, except the ad hoc tests. + void ClearNonAdHocTestResult() { + ForEach(test_cases_, TestCase::ClearTestCaseResult); + } + + // Clears the results of ad-hoc test assertions. + void ClearAdHocTestResult() { + ad_hoc_test_result_.Clear(); + } + + // Adds a TestProperty to the current TestResult object when invoked in a + // context of a test or a test case, or to the global property set. If the + // result already contains a property with the same key, the value will be + // updated. + void RecordProperty(const TestProperty& test_property); + + enum ReactionToSharding { + HONOR_SHARDING_PROTOCOL, + IGNORE_SHARDING_PROTOCOL + }; + + // Matches the full name of each test against the user-specified + // filter to decide whether the test should run, then records the + // result in each TestCase and TestInfo object. + // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests + // based on sharding variables in the environment. + // Returns the number of tests that should run. + int FilterTests(ReactionToSharding shard_tests); + + // Prints the names of the tests matching the user-specified filter flag. + void ListTestsMatchingFilter(); + + const TestCase* current_test_case() const { return current_test_case_; } + TestInfo* current_test_info() { return current_test_info_; } + const TestInfo* current_test_info() const { return current_test_info_; } + + // Returns the vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector& environments() { return environments_; } + + // Getters for the per-thread Google Test trace stack. + std::vector& gtest_trace_stack() { + return *(gtest_trace_stack_.pointer()); + } + const std::vector& gtest_trace_stack() const { + return gtest_trace_stack_.get(); + } + +#if GTEST_HAS_DEATH_TEST + void InitDeathTestSubprocessControlInfo() { + internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); + } + // Returns a pointer to the parsed --gtest_internal_run_death_test + // flag, or NULL if that flag was not specified. + // This information is useful only in a death test child process. + // Must not be called before a call to InitGoogleTest. + const InternalRunDeathTestFlag* internal_run_death_test_flag() const { + return internal_run_death_test_flag_.get(); + } + + // Returns a pointer to the current death test factory. 
+ internal::DeathTestFactory* death_test_factory() { + return death_test_factory_.get(); + } + + void SuppressTestEventsIfInSubprocess(); + + friend class ReplaceDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + + // Initializes the event listener performing XML output as specified by + // UnitTestOptions. Must not be called before InitGoogleTest. + void ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Initializes the event listener for streaming test results to a socket. + // Must not be called before InitGoogleTest. + void ConfigureStreamingOutput(); +#endif + + // Performs initialization dependent upon flag values obtained in + // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to + // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest + // this function is also called from RunAllTests. Since this function can be + // called more than once, it has to be idempotent. + void PostFlagParsingInit(); + + // Gets the random seed used at the start of the current test iteration. + int random_seed() const { return random_seed_; } + + // Gets the random number generator. + internal::Random* random() { return &random_; } + + // Shuffles all test cases, and the tests within each test case, + // making sure that death tests are still run first. + void ShuffleTests(); + + // Restores the test cases and tests to their order before the first shuffle. + void UnshuffleTests(); + + // Returns the value of GTEST_FLAG(catch_exceptions) at the moment + // UnitTest::Run() starts. + bool catch_exceptions() const { return catch_exceptions_; } + + private: + friend class ::testing::UnitTest; + + // Used by UnitTest::Run() to capture the state of + // GTEST_FLAG(catch_exceptions) at the moment it starts. + void set_catch_exceptions(bool value) { catch_exceptions_ = value; } + + // The UnitTest object that owns this implementation object. + UnitTest* const parent_; + + // The working directory when the first TEST() or TEST_F() was + // executed. + internal::FilePath original_working_dir_; + + // The default test part result reporters. + DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; + DefaultPerThreadTestPartResultReporter + default_per_thread_test_part_result_reporter_; + + // Points to (but doesn't own) the global test part result reporter. + TestPartResultReporterInterface* global_test_part_result_repoter_; + + // Protects read and write access to global_test_part_result_reporter_. + internal::Mutex global_test_part_result_reporter_mutex_; + + // Points to (but doesn't own) the per-thread test part result reporter. + internal::ThreadLocal + per_thread_test_part_result_reporter_; + + // The vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector environments_; + + // The vector of TestCases in their original order. It owns the + // elements in the vector. + std::vector test_cases_; + + // Provides a level of indirection for the test case list to allow + // easy shuffling and restoring the test case order. The i-th + // element of this vector is the index of the i-th test case in the + // shuffled order. + std::vector test_case_indices_; + +#if GTEST_HAS_PARAM_TEST + // ParameterizedTestRegistry object used to register value-parameterized + // tests. + internal::ParameterizedTestCaseRegistry parameterized_test_registry_; + + // Indicates whether RegisterParameterizedTests() has been called already. 
+ bool parameterized_tests_registered_; +#endif // GTEST_HAS_PARAM_TEST + + // Index of the last death test case registered. Initially -1. + int last_death_test_case_; + + // This points to the TestCase for the currently running test. It + // changes as Google Test goes through one test case after another. + // When no test is running, this is set to NULL and Google Test + // stores assertion results in ad_hoc_test_result_. Initially NULL. + TestCase* current_test_case_; + + // This points to the TestInfo for the currently running test. It + // changes as Google Test goes through one test after another. When + // no test is running, this is set to NULL and Google Test stores + // assertion results in ad_hoc_test_result_. Initially NULL. + TestInfo* current_test_info_; + + // Normally, a user only writes assertions inside a TEST or TEST_F, + // or inside a function called by a TEST or TEST_F. Since Google + // Test keeps track of which test is current running, it can + // associate such an assertion with the test it belongs to. + // + // If an assertion is encountered when no TEST or TEST_F is running, + // Google Test attributes the assertion result to an imaginary "ad hoc" + // test, and records the result in ad_hoc_test_result_. + TestResult ad_hoc_test_result_; + + // The list of event listeners that can be used to track events inside + // Google Test. + TestEventListeners listeners_; + + // The OS stack trace getter. Will be deleted when the UnitTest + // object is destructed. By default, an OsStackTraceGetter is used, + // but the user can set this field to use a custom getter if that is + // desired. + OsStackTraceGetterInterface* os_stack_trace_getter_; + + // True iff PostFlagParsingInit() has been called. + bool post_flag_parse_init_performed_; + + // The random number seed used at the beginning of the test run. + int random_seed_; + + // Our random number generator. + internal::Random random_; + + // The time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp_; + + // How long the test took to run, in milliseconds. + TimeInMillis elapsed_time_; + +#if GTEST_HAS_DEATH_TEST + // The decomposed components of the gtest_internal_run_death_test flag, + // parsed when RUN_ALL_TESTS is called. + internal::scoped_ptr internal_run_death_test_flag_; + internal::scoped_ptr death_test_factory_; +#endif // GTEST_HAS_DEATH_TEST + + // A per-thread stack of traces created by the SCOPED_TRACE() macro. + internal::ThreadLocal > gtest_trace_stack_; + + // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() + // starts. + bool catch_exceptions_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); +}; // class UnitTestImpl + +// Convenience function for accessing the global UnitTest +// implementation object. +inline UnitTestImpl* GetUnitTestImpl() { + return UnitTest::GetInstance()->impl(); +} + +#if GTEST_USES_SIMPLE_RE + +// Internal helper functions for implementing the simple regular +// expression matcher. 
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter. Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing. This class allow our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
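ParseNaturalNumber() above avoids overflow by parsing into the widest unsigned type the platform offers and then checking that the value survives a round trip through the requested integer type; TestResultAccessor, defined next, is unrelated test-only plumbing. A small standalone sketch of that overflow check specialized to int (illustrative only):

#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdlib>
#include <string>

// Parses a non-negative decimal string into *number, rejecting leading
// signs/whitespace, trailing junk, and values that do not fit in an int.
static bool ParsePositiveInt(const std::string& str, int* number) {
  if (str.empty() || !isdigit(static_cast<unsigned char>(str[0]))) return false;
  errno = 0;
  char* end = NULL;
  const unsigned long long parsed = strtoull(str.c_str(), &end, 10);
  if (*end != '\0' || errno != 0) return false;  // Trailing junk or out of range.
  const int result = static_cast<int>(parsed);
  if (static_cast<unsigned long long>(result) != parsed) return false;  // Too big for int.
  *number = result;
  return true;
}

int main() {
  int n = 0;
  assert(ParsePositiveInt("42", &n) && n == 42);
  assert(!ParsePositiveInt("42abc", &n));                 // Trailing junk.
  assert(!ParsePositiveInt("99999999999999999999", &n));  // Overflows.
  return 0;
}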
+class TestResultAccessor { + public: + static void RecordProperty(TestResult* test_result, + const std::string& xml_element, + const TestProperty& property) { + test_result->RecordProperty(xml_element, property); + } + + static void ClearTestPartResults(TestResult* test_result) { + test_result->ClearTestPartResults(); + } + + static const std::vector& test_part_results( + const TestResult& test_result) { + return test_result.test_part_results(); + } +}; + +#if GTEST_CAN_STREAM_RESULTS_ + +// Streams test results to the given port on the given host machine. +class GTEST_API_ StreamingListener : public EmptyTestEventListener { + public: + // Abstract base class for writing strings to a socket. + class AbstractSocketWriter { + public: + virtual ~AbstractSocketWriter() {} + + // Sends a string to the socket. + virtual void Send(const string& message) = 0; + + // Closes the socket. + virtual void CloseConnection() {} + + // Sends a string and a newline to the socket. + void SendLn(const string& message) { + Send(message + "\n"); + } + }; + + // Concrete class for actually writing strings to a socket. + class SocketWriter : public AbstractSocketWriter { + public: + SocketWriter(const string& host, const string& port) + : sockfd_(-1), host_name_(host), port_num_(port) { + MakeConnection(); + } + + virtual ~SocketWriter() { + if (sockfd_ != -1) + CloseConnection(); + } + + // Sends a string to the socket. + virtual void Send(const string& message) { + GTEST_CHECK_(sockfd_ != -1) + << "Send() can be called only when there is a connection."; + + const int len = static_cast(message.length()); + if (write(sockfd_, message.c_str(), len) != len) { + GTEST_LOG_(WARNING) + << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; + } + } + + private: + // Creates a client socket and connects to the server. + void MakeConnection(); + + // Closes the socket. + void CloseConnection() { + GTEST_CHECK_(sockfd_ != -1) + << "CloseConnection() can be called only when there is a connection."; + + close(sockfd_); + sockfd_ = -1; + } + + int sockfd_; // socket file descriptor + const string host_name_; + const string port_num_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); + }; // class SocketWriter + + // Escapes '=', '&', '%', and '\n' characters in str as "%xx". + static string UrlEncode(const char* str); + + StreamingListener(const string& host, const string& port) + : socket_writer_(new SocketWriter(host, port)) { Start(); } + + explicit StreamingListener(AbstractSocketWriter* socket_writer) + : socket_writer_(socket_writer) { Start(); } + + void OnTestProgramStart(const UnitTest& /* unit_test */) { + SendLn("event=TestProgramStart"); + } + + void OnTestProgramEnd(const UnitTest& unit_test) { + // Note that Google Test current only report elapsed time for each + // test iteration, not for the entire test program. + SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); + + // Notify the streaming server to stop. 
+ socket_writer_->CloseConnection(); + } + + void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { + SendLn("event=TestIterationStart&iteration=" + + StreamableToString(iteration)); + } + + void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { + SendLn("event=TestIterationEnd&passed=" + + FormatBool(unit_test.Passed()) + "&elapsed_time=" + + StreamableToString(unit_test.elapsed_time()) + "ms"); + } + + void OnTestCaseStart(const TestCase& test_case) { + SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); + } + + void OnTestCaseEnd(const TestCase& test_case) { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + "ms"); + } + + void OnTestStart(const TestInfo& test_info) { + SendLn(std::string("event=TestStart&name=") + test_info.name()); + } + + void OnTestEnd(const TestInfo& test_info) { + SendLn("event=TestEnd&passed=" + + FormatBool((test_info.result())->Passed()) + + "&elapsed_time=" + + StreamableToString((test_info.result())->elapsed_time()) + "ms"); + } + + void OnTestPartResult(const TestPartResult& test_part_result) { + const char* file_name = test_part_result.file_name(); + if (file_name == NULL) + file_name = ""; + SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + + "&line=" + StreamableToString(test_part_result.line_number()) + + "&message=" + UrlEncode(test_part_result.message())); + } + + private: + // Sends the given message and a newline to the socket. + void SendLn(const string& message) { socket_writer_->SendLn(message); } + + // Called at the start of streaming to notify the receiver what + // protocol we are using. + void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } + + string FormatBool(bool value) { return value ? "1" : "0"; } + + const scoped_ptr socket_writer_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); +}; // class StreamingListener + +#endif // GTEST_CAN_STREAM_RESULTS_ + +} // namespace internal +} // namespace testing + +#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-port.cc b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc new file mode 100644 index 0000000000..e5bf3dd2be --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -0,0 +1,1259 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>
+# include <io.h>
+# include <sys/stat.h>
+# include <map>  // Used in ThreadLocal.
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <fcntl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+# include <procinfo.h>
+# include <sys/types.h>
+#endif  // GTEST_OS_AIX
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_LINUX
+
+namespace {
+template <typename T>
+T ReadProcFileField(const string& filename, int field) {
+  std::string dummy;
+  std::ifstream file(filename.c_str());
+  while (field-- > 0) {
+    file >> dummy;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+}  // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+  const string filename =
+      (Message() << "/proc/" << getpid() << "/stat").GetString();
+  return ReadProcFileField<int>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() { + const int fd = open("/proc/self/as", O_RDONLY); + if (fd < 0) { + return 0; + } + procfs_info process_info; + const int status = + devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); + close(fd); + if (status == EOK) { + return static_cast(process_info.num_threads); + } else { + return 0; + } +} + +#elif GTEST_OS_AIX + +size_t GetThreadCount() { + struct procentry64 entry; + pid_t pid = getpid(); + int status = getprocs64(&entry, sizeof(entry), NULL, 0, &pid, 1); + if (status == 1) { + return entry.pi_thcount; + } else { + return 0; + } +} + +#else + +size_t GetThreadCount() { + // There's no portable way to detect the number of threads, so we just + // return 0 to indicate that we cannot detect it. + return 0; +} + +#endif // GTEST_OS_LINUX + +#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +void SleepMilliseconds(int n) { + ::Sleep(n); +} + +AutoHandle::AutoHandle() + : handle_(INVALID_HANDLE_VALUE) {} + +AutoHandle::AutoHandle(Handle handle) + : handle_(handle) {} + +AutoHandle::~AutoHandle() { + Reset(); +} + +AutoHandle::Handle AutoHandle::Get() const { + return handle_; +} + +void AutoHandle::Reset() { + Reset(INVALID_HANDLE_VALUE); +} + +void AutoHandle::Reset(HANDLE handle) { + // Resetting with the same handle we already own is invalid. + if (handle_ != handle) { + if (IsCloseable()) { + ::CloseHandle(handle_); + } + handle_ = handle; + } else { + GTEST_CHECK_(!IsCloseable()) + << "Resetting a valid handle to itself is likely a programmer error " + "and thus not allowed."; + } +} + +bool AutoHandle::IsCloseable() const { + // Different Windows APIs may use either of these values to represent an + // invalid handle. + return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE; +} + +Notification::Notification() + : event_(::CreateEvent(NULL, // Default security attributes. + TRUE, // Do not reset automatically. + FALSE, // Initially unset. + NULL)) { // Anonymous event. + GTEST_CHECK_(event_.Get() != NULL); +} + +void Notification::Notify() { + GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); +} + +void Notification::WaitForNotification() { + GTEST_CHECK_( + ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0); +} + +Mutex::Mutex() + : owner_thread_id_(0), + type_(kDynamic), + critical_section_init_phase_(0), + critical_section_(new CRITICAL_SECTION) { + ::InitializeCriticalSection(critical_section_); +} + +Mutex::~Mutex() { + // Static mutexes are leaked intentionally. It is not thread-safe to try + // to clean them up. + // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires + // nothing to clean it up but is available only on Vista and later. + // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx + if (type_ == kDynamic) { + ::DeleteCriticalSection(critical_section_); + delete critical_section_; + critical_section_ = NULL; + } +} + +void Mutex::Lock() { + ThreadSafeLazyInit(); + ::EnterCriticalSection(critical_section_); + owner_thread_id_ = ::GetCurrentThreadId(); +} + +void Mutex::Unlock() { + ThreadSafeLazyInit(); + // We don't protect writing to owner_thread_id_ here, as it's the + // caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + owner_thread_id_ = 0; + ::LeaveCriticalSection(critical_section_); +} + +// Does nothing if the current thread holds the mutex. Otherwise, crashes +// with high probability. 
+void Mutex::AssertHeld() { + ThreadSafeLazyInit(); + GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId()) + << "The current thread is not holding the mutex @" << this; +} + +// Initializes owner_thread_id_ and critical_section_ in static mutexes. +void Mutex::ThreadSafeLazyInit() { + // Dynamic mutexes are initialized in the constructor. + if (type_ == kStatic) { + switch ( + ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) { + case 0: + // If critical_section_init_phase_ was 0 before the exchange, we + // are the first to test it and need to perform the initialization. + owner_thread_id_ = 0; + critical_section_ = new CRITICAL_SECTION; + ::InitializeCriticalSection(critical_section_); + // Updates the critical_section_init_phase_ to 2 to signal + // initialization complete. + GTEST_CHECK_(::InterlockedCompareExchange( + &critical_section_init_phase_, 2L, 1L) == + 1L); + break; + case 1: + // Somebody else is already initializing the mutex; spin until they + // are done. + while (::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, + 2L) != 2L) { + // Possibly yields the rest of the thread's time slice to other + // threads. + ::Sleep(0); + } + break; + + case 2: + break; // The mutex is already initialized and ready for use. + + default: + GTEST_CHECK_(false) + << "Unexpected value of critical_section_init_phase_ " + << "while initializing a static mutex."; + } + } +} + +namespace { + +class ThreadWithParamSupport : public ThreadWithParamBase { + public: + static HANDLE CreateThread(Runnable* runnable, + Notification* thread_can_start) { + ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); + DWORD thread_id; + // TODO(yukawa): Consider to use _beginthreadex instead. + HANDLE thread_handle = ::CreateThread( + NULL, // Default security. + 0, // Default stack size. + &ThreadWithParamSupport::ThreadMain, + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. + &thread_id); // Need a valid pointer for the call to work under Win98. + GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error " + << ::GetLastError() << "."; + if (thread_handle == NULL) { + delete param; + } + return thread_handle; + } + + private: + struct ThreadMainParam { + ThreadMainParam(Runnable* runnable, Notification* thread_can_start) + : runnable_(runnable), + thread_can_start_(thread_can_start) { + } + scoped_ptr runnable_; + // Does not own. + Notification* thread_can_start_; + }; + + static DWORD WINAPI ThreadMain(void* ptr) { + // Transfers ownership. + scoped_ptr param(static_cast(ptr)); + if (param->thread_can_start_ != NULL) + param->thread_can_start_->WaitForNotification(); + param->runnable_->Run(); + return 0; + } + + // Prohibit instantiation. + ThreadWithParamSupport(); + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport); +}; + +} // namespace + +ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable, + Notification* thread_can_start) + : thread_(ThreadWithParamSupport::CreateThread(runnable, + thread_can_start)) { +} + +ThreadWithParamBase::~ThreadWithParamBase() { + Join(); +} + +void ThreadWithParamBase::Join() { + GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) + << "Failed to join the thread with error " << ::GetLastError() << "."; +} + +// Maps a thread to a set of ThreadIdToThreadLocals that have values +// instantiated on that thread and notifies them when the thread exits. 
A +// ThreadLocal instance is expected to persist until all threads it has +// values on have terminated. +class ThreadLocalRegistryImpl { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { + DWORD current_thread = ::GetCurrentThreadId(); + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(current_thread); + if (thread_local_pos == thread_to_thread_locals->end()) { + thread_local_pos = thread_to_thread_locals->insert( + std::make_pair(current_thread, ThreadLocalValues())).first; + StartWatcherThreadFor(current_thread); + } + ThreadLocalValues& thread_local_values = thread_local_pos->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos == thread_local_values.end()) { + value_pos = + thread_local_values + .insert(std::make_pair( + thread_local_instance, + linked_ptr( + thread_local_instance->NewValueForCurrentThread()))) + .first; + } + return value_pos->second.get(); + } + + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + std::vector > value_holders; + // Clean up the ThreadLocalValues data structure while holding the lock, but + // defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + for (ThreadIdToThreadLocals::iterator it = + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); + ++it) { + ThreadLocalValues& thread_local_values = it->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos != thread_local_values.end()) { + value_holders.push_back(value_pos->second); + thread_local_values.erase(value_pos); + // This 'if' can only be successful at most once, so theoretically we + // could break out of the loop here, but we don't bother doing so. + } + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + static void OnThreadExit(DWORD thread_id) { + GTEST_CHECK_(thread_id != 0) << ::GetLastError(); + std::vector > value_holders; + // Clean up the ThreadIdToThreadLocals data structure while holding the + // lock, but defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(thread_id); + if (thread_local_pos != thread_to_thread_locals->end()) { + ThreadLocalValues& thread_local_values = thread_local_pos->second; + for (ThreadLocalValues::iterator value_pos = + thread_local_values.begin(); + value_pos != thread_local_values.end(); + ++value_pos) { + value_holders.push_back(value_pos->second); + } + thread_to_thread_locals->erase(thread_local_pos); + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + private: + // In a particular thread, maps a ThreadLocal object to its value. 
+ typedef std::map > ThreadLocalValues; + // Stores all ThreadIdToThreadLocals having values in a thread, indexed by + // thread's ID. + typedef std::map ThreadIdToThreadLocals; + + // Holds the thread id and thread handle that we pass from + // StartWatcherThreadFor to WatcherThreadFunc. + typedef std::pair ThreadIdAndHandle; + + static void StartWatcherThreadFor(DWORD thread_id) { + // The returned handle will be kept in thread_map and closed by + // watcher_thread in WatcherThreadFunc. + HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, + FALSE, + thread_id); + GTEST_CHECK_(thread != NULL); + // We need to to pass a valid thread ID pointer into CreateThread for it + // to work correctly under Win98. + DWORD watcher_thread_id; + HANDLE watcher_thread = ::CreateThread( + NULL, // Default security. + 0, // Default stack size + &ThreadLocalRegistryImpl::WatcherThreadFunc, + reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), + CREATE_SUSPENDED, + &watcher_thread_id); + GTEST_CHECK_(watcher_thread != NULL); + // Give the watcher thread the same priority as ours to avoid being + // blocked by it. + ::SetThreadPriority(watcher_thread, + ::GetThreadPriority(::GetCurrentThread())); + ::ResumeThread(watcher_thread); + ::CloseHandle(watcher_thread); + } + + // Monitors exit from a given thread and notifies those + // ThreadIdToThreadLocals about thread termination. + static DWORD WINAPI WatcherThreadFunc(LPVOID param) { + const ThreadIdAndHandle* tah = + reinterpret_cast(param); + GTEST_CHECK_( + ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + OnThreadExit(tah->first); + ::CloseHandle(tah->second); + delete tah; + return 0; + } + + // Returns map of thread local instances. + static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { + mutex_.AssertHeld(); + static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals; + return map; + } + + // Protects access to GetThreadLocalsMapLocked() and its return value. + static Mutex mutex_; + // Protects access to GetThreadMapLocked() and its return value. + static Mutex thread_map_mutex_; +}; + +Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); +Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); + +ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { + return ThreadLocalRegistryImpl::GetValueOnCurrentThread( + thread_local_instance); +} + +void ThreadLocalRegistry::OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); +} + +#endif // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +#if GTEST_USES_POSIX_RE + +// Implements RE. Currently only needed for death tests. + +RE::~RE() { + if (is_valid_) { + // regfree'ing an invalid regex might crash because the content + // of the regex is undefined. Since the regex's are essentially + // the same, one cannot be valid (or invalid) without the other + // being so too. + regfree(&partial_regex_); + regfree(&full_regex_); + } + free(const_cast(pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.full_regex_, str, 1, &match, 0) == 0; +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). 
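For illustration, the two entry points differ only in anchoring; a minimal sketch, assuming an RE built from a POSIX Extended pattern:

    const RE re("[0-9]+");
    bool partial = RE::PartialMatch("abc123", re);  // true: a substring matches
    bool full = RE::FullMatch("abc123", re);        // false: the entire string must match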
+bool RE::PartialMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = posix::StrDup(regex); + + // Reserves enough bytes to hold the regular expression used for a + // full match. + const size_t full_regex_len = strlen(regex) + 10; + char* const full_pattern = new char[full_regex_len]; + + snprintf(full_pattern, full_regex_len, "^(%s)$", regex); + is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; + // We want to call regcomp(&partial_regex_, ...) even if the + // previous expression returns false. Otherwise partial_regex_ may + // not be properly initialized can may cause trouble when it's + // freed. + // + // Some implementation of POSIX regex (e.g. on at least some + // versions of Cygwin) doesn't accept the empty string as a valid + // regex. We change it to an equivalent form "()" to be safe. + if (is_valid_) { + const char* const partial_regex = (*regex == '\0') ? "()" : regex; + is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; + } + EXPECT_TRUE(is_valid_) + << "Regular expression \"" << regex + << "\" is not a valid POSIX Extended regular expression."; + + delete[] full_pattern; +} + +#elif GTEST_USES_SIMPLE_RE + +// Returns true iff ch appears anywhere in str (excluding the +// terminating '\0' character). +bool IsInSet(char ch, const char* str) { + return ch != '\0' && strchr(str, ch) != NULL; +} + +// Returns true iff ch belongs to the given classification. Unlike +// similar functions in , these aren't affected by the +// current locale. +bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } +bool IsAsciiPunct(char ch) { + return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); +} +bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } +bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } +bool IsAsciiWordChar(char ch) { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || + ('0' <= ch && ch <= '9') || ch == '_'; +} + +// Returns true iff "\\c" is a supported escape sequence. +bool IsValidEscape(char c) { + return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); +} + +// Returns true iff the given atom (specified by escaped and pattern) +// matches ch. The result is undefined if the atom is invalid. +bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { + if (escaped) { // "\\p" where p is pattern_char. + switch (pattern_char) { + case 'd': return IsAsciiDigit(ch); + case 'D': return !IsAsciiDigit(ch); + case 'f': return ch == '\f'; + case 'n': return ch == '\n'; + case 'r': return ch == '\r'; + case 's': return IsAsciiWhiteSpace(ch); + case 'S': return !IsAsciiWhiteSpace(ch); + case 't': return ch == '\t'; + case 'v': return ch == '\v'; + case 'w': return IsAsciiWordChar(ch); + case 'W': return !IsAsciiWordChar(ch); + } + return IsAsciiPunct(pattern_char) && pattern_char == ch; + } + + return (pattern_char == '.' && ch != '\n') || pattern_char == ch; +} + +// Helper function used by ValidateRegex() to format error messages. +std::string FormatRegexSyntaxError(const char* regex, int index) { + return (Message() << "Syntax error at index " << index + << " in simple regular expression \"" << regex << "\": ").GetString(); +} + +// Generates non-fatal failures and returns false if regex is invalid; +// otherwise returns true. 
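Two hypothetical patterns illustrate the checks performed below:

    ValidateRegex("\\d+x?");  // accepted: escapes and repetitions are well-formed
    ValidateRegex("ab^c");    // rejected: '^' may only appear at the beginning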
+bool ValidateRegex(const char* regex) { + if (regex == NULL) { + // TODO(wan@google.com): fix the source file location in the + // assertion failures to match where the regex is used in user + // code. + ADD_FAILURE() << "NULL is not a valid simple regular expression."; + return false; + } + + bool is_valid = true; + + // True iff ?, *, or + can follow the previous atom. + bool prev_repeatable = false; + for (int i = 0; regex[i]; i++) { + if (regex[i] == '\\') { // An escape sequence + i++; + if (regex[i] == '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "'\\' cannot appear at the end."; + return false; + } + + if (!IsValidEscape(regex[i])) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "invalid escape sequence \"\\" << regex[i] << "\"."; + is_valid = false; + } + prev_repeatable = true; + } else { // Not an escape sequence. + const char ch = regex[i]; + + if (ch == '^' && i > 0) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'^' can only appear at the beginning."; + is_valid = false; + } else if (ch == '$' && regex[i + 1] != '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'$' can only appear at the end."; + is_valid = false; + } else if (IsInSet(ch, "()[]{}|")) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' is unsupported."; + is_valid = false; + } else if (IsRepeat(ch) && !prev_repeatable) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' can only follow a repeatable token."; + is_valid = false; + } + + prev_repeatable = !IsInSet(ch, "^$?*+"); + } + } + + return is_valid; +} + +// Matches a repeated regex atom followed by a valid simple regular +// expression. The regex atom is defined as c if escaped is false, +// or \c otherwise. repeat is the repetition meta character (?, *, +// or +). The behavior is undefined if str contains too many +// characters to be indexable by size_t, in which case the test will +// probably time out anyway. We are fine with this limitation as +// std::string has it too. +bool MatchRepetitionAndRegexAtHead( + bool escaped, char c, char repeat, const char* regex, + const char* str) { + const size_t min_count = (repeat == '+') ? 1 : 0; + const size_t max_count = (repeat == '?') ? 1 : + static_cast(-1) - 1; + // We cannot call numeric_limits::max() as it conflicts with the + // max() macro on Windows. + + for (size_t i = 0; i <= max_count; ++i) { + // We know that the atom matches each of the first i characters in str. + if (i >= min_count && MatchRegexAtHead(regex, str + i)) { + // We have enough matches at the head, and the tail matches too. + // Since we only care about *whether* the pattern matches str + // (as opposed to *how* it matches), there is no need to find a + // greedy match. + return true; + } + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) + return false; + } + return false; +} + +// Returns true iff regex matches a prefix of str. regex must be a +// valid simple regular expression and not start with "^", or the +// result is undefined. +bool MatchRegexAtHead(const char* regex, const char* str) { + if (*regex == '\0') // An empty regex matches a prefix of anything. + return true; + + // "$" only matches the end of a string. Note that regex being + // valid guarantees that there's nothing after "$" in it. + if (*regex == '$') + return *str == '\0'; + + // Is the first thing in regex an escape sequence? 
+ const bool escaped = *regex == '\\'; + if (escaped) + ++regex; + if (IsRepeat(regex[1])) { + // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so + // here's an indirect recursion. It terminates as the regex gets + // shorter in each recursion. + return MatchRepetitionAndRegexAtHead( + escaped, regex[0], regex[1], regex + 2, str); + } else { + // regex isn't empty, isn't "$", and doesn't start with a + // repetition. We match the first atom of regex with the first + // character of str and recurse. + return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && + MatchRegexAtHead(regex + 1, str + 1); + } +} + +// Returns true iff regex matches any substring of str. regex must be +// a valid simple regular expression, or the result is undefined. +// +// The algorithm is recursive, but the recursion depth doesn't exceed +// the regex length, so we won't need to worry about running out of +// stack space normally. In rare cases the time complexity can be +// exponential with respect to the regex length + the string length, +// but usually it's must faster (often close to linear). +bool MatchRegexAnywhere(const char* regex, const char* str) { + if (regex == NULL || str == NULL) + return false; + + if (*regex == '^') + return MatchRegexAtHead(regex + 1, str); + + // A successful match can be anywhere in str. + do { + if (MatchRegexAtHead(regex, str)) + return true; + } while (*str++ != '\0'); + return false; +} + +// Implements the RE class. + +RE::~RE() { + free(const_cast(pattern_)); + free(const_cast(full_pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = full_pattern_ = NULL; + if (regex != NULL) { + pattern_ = posix::StrDup(regex); + } + + is_valid_ = ValidateRegex(regex); + if (!is_valid_) { + // No need to calculate the full pattern when the regex is invalid. + return; + } + + const size_t len = strlen(regex); + // Reserves enough bytes to hold the regular expression used for a + // full match: we need space to prepend a '^', append a '$', and + // terminate the string with '\0'. + char* buffer = static_cast(malloc(len + 3)); + full_pattern_ = buffer; + + if (*regex != '^') + *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. + + // We don't use snprintf or strncpy, as they trigger a warning when + // compiled with VC++ 8.0. + memcpy(buffer, regex, len); + buffer += len; + + if (len == 0 || regex[len - 1] != '$') + *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. + + *buffer = '\0'; +} + +#endif // GTEST_USES_POSIX_RE + +const char kUnknownFile[] = "unknown file"; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { + const std::string file_name(file == NULL ? 
kUnknownFile : file); + + if (line < 0) { + return file_name + ":"; + } +#ifdef _MSC_VER + return file_name + "(" + StreamableToString(line) + "):"; +#else + return file_name + ":" + StreamableToString(line) + ":"; +#endif // _MSC_VER +} + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +// Note that FormatCompilerIndependentFileLocation() does NOT append colon +// to the file location it produces, unlike FormatFileLocation(). +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( + const char* file, int line) { + const std::string file_name(file == NULL ? kUnknownFile : file); + + if (line < 0) + return file_name; + else + return file_name + ":" + StreamableToString(line); +} + +GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) + : severity_(severity) { + const char* const marker = + severity == GTEST_INFO ? "[ INFO ]" : + severity == GTEST_WARNING ? "[WARNING]" : + severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; + GetStream() << ::std::endl << marker << " " + << FormatFileLocation(file, line).c_str() << ": "; +} + +// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. +GTestLog::~GTestLog() { + GetStream() << ::std::endl; + if (severity_ == GTEST_FATAL) { + fflush(stderr); + posix::Abort(); + } +} +// Disable Microsoft deprecation warnings for POSIX functions called from +// this class (creat, dup, dup2, and close) +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) + +#if GTEST_HAS_STREAM_REDIRECTION + +// Object that captures an output stream (stdout/stderr). +class CapturedStream { + public: + // The ctor redirects the stream to a temporary file. + explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { +# if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT + char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT + + ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); + const UINT success = ::GetTempFileNameA(temp_dir_path, + "gtest_redir", + 0, // Generate unique file name. + temp_file_path); + GTEST_CHECK_(success != 0) + << "Unable to create a temporary file in " << temp_dir_path; + const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); + GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " + << temp_file_path; + filename_ = temp_file_path; +# else + // There's no guarantee that a test has write access to the current + // directory, so we create the temporary file in the /tmp directory + // instead. We use /tmp on most systems, and /sdcard on Android. + // That's because Android doesn't have /tmp. +# if GTEST_OS_LINUX_ANDROID + // Note: Android applications are expected to call the framework's + // Context.getExternalStorageDirectory() method through JNI to get + // the location of the world-writable SD Card directory. However, + // this requires a Context handle, which cannot be retrieved + // globally from native code. Doing so also precludes running the + // code as part of a regular standalone executable, which doesn't + // run in a Dalvik process (e.g. when running it through 'adb shell'). + // + // The location /sdcard is directly accessible from native code + // and is the only location (unofficially) supported by the Android + // team. It's generally a symlink to the real SD Card mount point + // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or + // other OEM-customized locations. 
Never rely on these, and always + // use /sdcard. + char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; +# else + char name_template[] = "/tmp/captured_stream.XXXXXX"; +# endif // GTEST_OS_LINUX_ANDROID + const int captured_fd = mkstemp(name_template); + filename_ = name_template; +# endif // GTEST_OS_WINDOWS + fflush(NULL); + dup2(captured_fd, fd_); + close(captured_fd); + } + + ~CapturedStream() { + remove(filename_.c_str()); + } + + std::string GetCapturedString() { + if (uncaptured_fd_ != -1) { + // Restores the original stream. + fflush(NULL); + dup2(uncaptured_fd_, fd_); + close(uncaptured_fd_); + uncaptured_fd_ = -1; + } + + FILE* const file = posix::FOpen(filename_.c_str(), "r"); + const std::string content = ReadEntireFile(file); + posix::FClose(file); + return content; + } + + private: + const int fd_; // A stream to capture. + int uncaptured_fd_; + // Name of the temporary file holding the stderr output. + ::std::string filename_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() + +static CapturedStream* g_captured_stderr = NULL; +static CapturedStream* g_captured_stdout = NULL; + +// Starts capturing an output stream (stdout/stderr). +void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { + if (*stream != NULL) { + GTEST_LOG_(FATAL) << "Only one " << stream_name + << " capturer can exist at a time."; + } + *stream = new CapturedStream(fd); +} + +// Stops capturing the output stream and returns the captured string. +std::string GetCapturedStream(CapturedStream** captured_stream) { + const std::string content = (*captured_stream)->GetCapturedString(); + + delete *captured_stream; + *captured_stream = NULL; + + return content; +} + +// Starts capturing stdout. +void CaptureStdout() { + CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); +} + +// Starts capturing stderr. +void CaptureStderr() { + CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); +} + +// Stops capturing stdout and returns the captured string. +std::string GetCapturedStdout() { + return GetCapturedStream(&g_captured_stdout); +} + +// Stops capturing stderr and returns the captured string. +std::string GetCapturedStderr() { + return GetCapturedStream(&g_captured_stderr); +} + +#endif // GTEST_HAS_STREAM_REDIRECTION + +std::string TempDir() { +#if GTEST_OS_WINDOWS_MOBILE + return "\\temp\\"; +#elif GTEST_OS_WINDOWS + const char* temp_dir = posix::GetEnv("TEMP"); + if (temp_dir == NULL || temp_dir[0] == '\0') + return "\\temp\\"; + else if (temp_dir[strlen(temp_dir) - 1] == '\\') + return temp_dir; + else + return std::string(temp_dir) + "\\"; +#elif GTEST_OS_LINUX_ANDROID + return "/sdcard/"; +#else + return "/tmp/"; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +size_t GetFileSize(FILE* file) { + fseek(file, 0, SEEK_END); + return static_cast(ftell(file)); +} + +std::string ReadEntireFile(FILE* file) { + const size_t file_size = GetFileSize(file); + char* const buffer = new char[file_size]; + + size_t bytes_last_read = 0; // # of bytes read in the last fread() + size_t bytes_read = 0; // # of bytes read so far + + fseek(file, 0, SEEK_SET); + + // Keeps reading the file until we cannot read further or the + // pre-determined file size is reached. 
+ do { + bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_read += bytes_last_read; + } while (bytes_last_read > 0 && bytes_read < file_size); + + const std::string content(buffer, bytes_read); + delete[] buffer; + + return content; +} + +#if GTEST_HAS_DEATH_TEST + +static const ::std::vector* g_injected_test_argvs = + NULL; // Owned. + +void SetInjectableArgvs(const ::std::vector* argvs) { + if (g_injected_test_argvs != argvs) + delete g_injected_test_argvs; + g_injected_test_argvs = argvs; +} + +const ::std::vector& GetInjectableArgvs() { + if (g_injected_test_argvs != NULL) { + return *g_injected_test_argvs; + } + return GetArgvs(); +} +#endif // GTEST_HAS_DEATH_TEST + +#if GTEST_OS_WINDOWS_MOBILE +namespace posix { +void Abort() { + DebugBreak(); + TerminateProcess(GetCurrentProcess(), 1); +} +} // namespace posix +#endif // GTEST_OS_WINDOWS_MOBILE + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "GTEST_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string full_flag = + (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); + + Message env_var; + for (size_t i = 0; i != full_flag.length(); i++) { + env_var << ToUpper(full_flag.c_str()[i]); + } + + return env_var.GetString(); +} + +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const Message& src_text, const char* str, Int32* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value \"" << str << "\".\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + // Is the parsed value in the range of an Int32? + const Int32 result = static_cast(long_value); + if (long_value == LONG_MAX || long_value == LONG_MIN || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an Int32. + ) { + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value " << str << ", which overflows.\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + *value = result; + return true; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true iff it's not "0". +bool BoolFromGTestEnv(const char* flag, bool default_value) { +#if defined(GTEST_GET_BOOL_FROM_ENV_) + return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); +#endif // defined(GTEST_GET_BOOL_FROM_ENV_) + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + return string_value == NULL ? + default_value : strcmp(string_value, "0") != 0; +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. 
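For example, assuming the standard "gtest_" flag prefix, a call like the following consults the GTEST_STACK_TRACE_DEPTH environment variable and falls back to the given default when it is unset or malformed:

    const Int32 depth = Int32FromGTestEnv("stack_trace_depth", 100);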
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { +#if defined(GTEST_GET_INT32_FROM_ENV_) + return GTEST_GET_INT32_FROM_ENV_(flag, default_value); +#endif // defined(GTEST_GET_INT32_FROM_ENV_) + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + if (string_value == NULL) { + // The environment variable is not set. + return default_value; + } + + Int32 result = default_value; + if (!ParseInt32(Message() << "Environment variable " << env_var, + string_value, &result)) { + printf("The default value %s is used.\n", + (Message() << default_value).GetString().c_str()); + fflush(stdout); + return default_value; + } + + return result; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +std::string StringFromGTestEnv(const char* flag, const char* default_value) { +#if defined(GTEST_GET_STRING_FROM_ENV_) + return GTEST_GET_STRING_FROM_ENV_(flag, default_value); +#endif // defined(GTEST_GET_STRING_FROM_ENV_) + const std::string env_var = FlagToEnvVar(flag); + const char* value = posix::GetEnv(env_var.c_str()); + if (value != NULL) { + return value; + } + + // As a special case for the 'output' flag, if GTEST_OUTPUT is not + // set, we look for XML_OUTPUT_FILE, which is set by the Bazel build + // system. The value of XML_OUTPUT_FILE is a filename without the + // "xml:" prefix of GTEST_OUTPUT. + // + // The net priority order after flag processing is thus: + // --gtest_output command line flag + // GTEST_OUTPUT environment variable + // XML_OUTPUT_FILE environment variable + // 'default_value' + if (strcmp(flag, "output") == 0) { + value = posix::GetEnv("XML_OUTPUT_FILE"); + if (value != NULL) { + return std::string("xml:") + value; + } + } + return default_value; +} + +} // namespace internal +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc new file mode 100644 index 0000000000..a2df412f8a --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -0,0 +1,373 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// It uses the << operator when possible, and prints the bytes in the +// object otherwise. A user can override its behavior for a class +// type Foo by defining either operator<<(::std::ostream&, const Foo&) +// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that +// defines Foo. + +#include "gtest/gtest-printers.h" +#include +#include +#include +#include // NOLINT +#include +#include "gtest/internal/gtest-port.h" + +namespace testing { + +namespace { + +using ::std::ostream; + +// Prints a segment of bytes in the given object. +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, + size_t count, ostream* os) { + char text[5] = ""; + for (size_t i = 0; i != count; i++) { + const size_t j = start + i; + if (i != 0) { + // Organizes the bytes into groups of 2 for easy parsing by + // human. + if ((j % 2) == 0) + *os << ' '; + else + *os << '-'; + } + GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); + *os << text; + } +} + +// Prints the bytes in the given value to the given ostream. +void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, + ostream* os) { + // Tells the user how big the object is. + *os << count << "-byte object <"; + + const size_t kThreshold = 132; + const size_t kChunkSize = 64; + // If the object size is bigger than kThreshold, we'll have to omit + // some details by printing only the first and the last kChunkSize + // bytes. + // TODO(wan): let the user control the threshold using a flag. + if (count < kThreshold) { + PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); + } else { + PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); + *os << " ... "; + // Rounds up to 2-byte boundary. + const size_t resume_pos = (count - kChunkSize + 1)/2*2; + PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); + } + *os << ">"; +} + +} // namespace + +namespace internal2 { + +// Delegates to PrintBytesInObjectToImpl() to print the bytes in the +// given object. The delegation simplifies the implementation, which +// uses the << operator and thus is easier done outside of the +// ::testing::internal namespace, which contains a << operator that +// sometimes conflicts with the one in STL. +void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, + ostream* os) { + PrintBytesInObjectToImpl(obj_bytes, count, os); +} + +} // namespace internal2 + +namespace internal { + +// Depending on the value of a char (or wchar_t), we print it in one +// of three formats: +// - as is if it's a printable ASCII (e.g. 
'a', '2', ' '),
+// - as a hexidecimal escape sequence (e.g. '\x7F'), or
+// - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  return 0x20 <= c && c <= 0x7E;
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo<wchar_t>(c, os);
+  }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(
+      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexidecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream. CharType must be either
+// char or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +static void PrintCharsAsStringTo( + const CharType* begin, size_t len, ostream* os) { + const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; + *os << kQuoteBegin; + bool is_previous_hex = false; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" " << kQuoteBegin; + } + is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + } + *os << "\""; +} + +// Prints a (const) char/wchar_t array of 'len' elements, starting at address +// 'begin'. CharType must be either char or wchar_t. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +static void UniversalPrintCharArray( + const CharType* begin, size_t len, ostream* os) { + // The code + // const char kFoo[] = "foo"; + // generates an array of 4, not 3, elements, with the last one being '\0'. + // + // Therefore when printing a char array, we don't print the last element if + // it's '\0', such that the output matches the string literal as it's + // written in the source code. + if (len > 0 && begin[len - 1] == '\0') { + PrintCharsAsStringTo(begin, len - 1, os); + return; + } + + // If, however, the last element in the array is not '\0', e.g. + // const char kFoo[] = { 'f', 'o', 'o' }; + // we must print the entire array. We also print a message to indicate + // that the array is not NUL-terminated. + PrintCharsAsStringTo(begin, len, os); + *os << " (no terminating NUL)"; +} + +// Prints a (const) char array of 'len' elements, starting at address 'begin'. +void UniversalPrintArray(const char* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) wchar_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints the given C string to the ostream. +void PrintTo(const char* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, strlen(s), os); + } +} + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. +void PrintTo(const wchar_t* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, std::wcslen(s), os); + } +} +#endif // wchar_t is native + +// Prints a ::string object. 
+#if GTEST_HAS_GLOBAL_STRING +void PrintStringTo(const ::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +void PrintStringTo(const ::std::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +// Prints a ::wstring object. +#if GTEST_HAS_GLOBAL_WSTRING +void PrintWideStringTo(const ::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc new file mode 100644 index 0000000000..fb0e35425e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -0,0 +1,110 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// +// The Google C++ Testing Framework (Google Test) + +#include "gtest/gtest-test-part.h" + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick exists to +// prevent the accidental inclusion of gtest-internal-inl.h in the +// user's code. +#define GTEST_IMPLEMENTATION_ 1 +#include "src/gtest-internal-inl.h" +#undef GTEST_IMPLEMENTATION_ + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char* message) { + const char* const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == NULL ? message : + std::string(message, stack_trace); +} + +// Prints a TestPartResult object. 
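// Illustrative sketch of ExtractSummary()'s behavior described above: the
// summary is everything before the stack-trace marker, or the whole message
// when the marker is absent. The marker text mirrors kStackTraceMarker.
#include <cstring>
#include <iostream>
#include <string>

std::string SummaryOf(const char* message) {
  static const char kMarker[] = "\nStack trace:\n";
  const char* const trace = std::strstr(message, kMarker);
  return trace == NULL ? std::string(message) : std::string(message, trace);
}

int main() {
  std::cout << SummaryOf("Expected 1 == 2\nStack trace:\n#0 main()") << "\n";
  // Prints just "Expected 1 == 2".
  return 0;
}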
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { + return os + << result.file_name() << ":" << result.line_number() << ": " + << (result.type() == TestPartResult::kSuccess ? "Success" : + result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : + "Non-fatal failure") << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult& result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[index]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_(GetUnitTestImpl()-> + GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult& result) { + if (result.fatally_failed()) + has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc new file mode 100644 index 0000000000..df1eef4754 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -0,0 +1,118 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
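// Illustrative sketch (hypothetical types, not gtest internals) of the RAII
// pattern HasNewFatalFailureHelper uses above: install yourself as the current
// reporter in the constructor, remember the previous reporter, forward every
// result to it, and restore it in the destructor.
#include <iostream>

struct Reporter {
  virtual ~Reporter() {}
  virtual void Report(bool fatal) = 0;
};

struct PrintingReporter : Reporter {
  void Report(bool fatal) { std::cout << (fatal ? "fatal\n" : "non-fatal\n"); }
};

Reporter* g_current_reporter = 0;  // stand-in for the per-thread reporter

struct FatalFailureProbe : Reporter {
  bool saw_fatal;
  Reporter* previous;
  FatalFailureProbe() : saw_fatal(false), previous(g_current_reporter) {
    g_current_reporter = this;                               // intercept
  }
  ~FatalFailureProbe() { g_current_reporter = previous; }    // restore
  void Report(bool fatal) {
    if (fatal) saw_fatal = true;
    previous->Report(fatal);        // still forward to the original reporter
  }
};

int main() {
  PrintingReporter base;
  g_current_reporter = &base;
  {
    FatalFailureProbe probe;
    g_current_reporter->Report(true);
    std::cout << "probe saw fatal: " << probe.saw_fatal << "\n";  // 1
  }
  g_current_reporter->Report(false);  // base reporter is back in place
  return 0;
}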
+// +// Author: wan@google.com (Zhanyong Wan) + +#include "gtest/gtest-typed-test.h" +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +#if GTEST_HAS_TYPED_TEST_P + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char* SkipSpaces(const char* str) { + while (IsSpace(*str)) + str++; + return str; +} + +static std::vector SplitIntoTestNames(const char* src) { + std::vector name_vec; + src = SkipSpaces(src); + for (; src != NULL; src = SkipComma(src)) { + name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); + } + return name_vec; +} + +// Verifies that registered_tests match the test names in +// registered_tests_; returns registered_tests if successful, or +// aborts the program otherwise. +const char* TypedTestCasePState::VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests) { + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; + registered_ = true; + + std::vector name_vec = SplitIntoTestNames(registered_tests); + + Message errors; + + std::set tests; + for (std::vector::const_iterator name_it = name_vec.begin(); + name_it != name_vec.end(); ++name_it) { + const std::string& name = *name_it; + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + bool found = false; + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); + ++it) { + if (name == it->first) { + found = true; + break; + } + } + + if (found) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test case.\n"; + } + } + + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); + ++it) { + if (tests.count(it->first) == 0) { + errors << "You forgot to list test " << it->first << ".\n"; + } + } + + const std::string& errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +#endif // GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest.cc b/libs/libvpx/third_party/googletest/src/src/gtest.cc new file mode 100644 index 0000000000..5a8932c73e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest.cc @@ -0,0 +1,5389 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) + +#include "gtest/gtest.h" +#include "gtest/internal/custom/gtest.h" +#include "gtest/gtest-spi.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#if GTEST_OS_LINUX + +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +# include // NOLINT +# include // NOLINT +# include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include + +#elif GTEST_OS_SYMBIAN +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT + +#elif GTEST_OS_ZOS +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +# include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +# include // NOLINT +# undef min + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT + +# if GTEST_OS_WINDOWS_MINGW +// MinGW has gettimeofday() but not _ftime64(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +// TODO(kenton@google.com): There are other ways to get the time on +// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW +// supports these. consider using them instead. +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT +# endif // GTEST_OS_WINDOWS_MINGW + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include // NOLINT +# undef min + +#else + +// Assume other platforms have gettimeofday(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include // NOLINT +# include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT +#endif + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#include "src/gtest-internal-inl.h" +#undef GTEST_IMPLEMENTATION_ + +#if GTEST_OS_WINDOWS +# define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. 
+ +// A test whose test case name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test case whose name matches this filter is considered a death +// test case and will be run before test cases whose name doesn't +// match this filter. +static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output file for XML output. +static const char kDefaultOutputFile[] = "test_detail.xml"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. +const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +bool g_help_flag = false; + +} // namespace internal + +static const char* GetDefaultFilter() { +#ifdef GTEST_TEST_FILTER_ENV_VAR_ + const char* const testbridge_test_only = getenv(GTEST_TEST_FILTER_ENV_VAR_); + if (testbridge_test_only != NULL) { + return testbridge_test_only; + } +#endif // GTEST_TEST_FILTER_ENV_VAR_ + return kUniversalFilter; +} + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, + internal::BoolFromGTestEnv("break_on_failure", false), + "True iff a failed assertion should be a debugger break-point."); + +GTEST_DEFINE_bool_( + catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True iff " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, + internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, + internal::StringFromGTestEnv("filter", GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_(list_tests, false, + "List all tests without running them."); + +GTEST_DEFINE_string_( + output, + internal::StringFromGTestEnv("output", ""), + "A format (currently must be \"xml\"), optionally followed " + "by a colon and an output file name or directory. A directory " + "is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". 
" + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_( + print_time, + internal::BoolFromGTestEnv("print_time", true), + "True iff " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_int32_( + random_seed, + internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, + internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_( + show_internal_stack_frames, false, + "True iff " GTEST_NAME_ " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_( + shuffle, + internal::BoolFromGTestEnv("shuffle", false), + "True iff " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, + internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, + internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise."); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DEFINE_string_( + flagfile, + internal::StringFromGTestEnv("flagfile", ""), + "This flag specifies the flagfile to read command-line flags from."); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +UInt32 Random::Generate(UInt32 range) { + // These constants are the same as are used in glibc's rand(3). + state_ = (1103515245U*state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) + << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true iff the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). +static bool GTestIsInitialized() { return GetArgvs().size() > 0; } + +// Iterates over a vector of TestCases, keeping a running sum of the +// results of calling a given int-returning method on each. +// Returns the sum. 
+static int SumOverTestCaseList(const std::vector& case_list, + int (TestCase::*method)() const) { + int sum = 0; + for (size_t i = 0; i < case_list.size(); i++) { + sum += (case_list[i]->*method)(); + } + return sum; +} + +// Returns true iff the test case passed. +static bool TestCasePassed(const TestCase* test_case) { + return test_case->should_run() && test_case->Passed(); +} + +// Returns true iff the test case failed. +static bool TestCaseFailed(const TestCase* test_case) { + return test_case->should_run() && test_case->Failed(); +} + +// Returns true iff test_case contains at least one test that should +// run. +static bool ShouldRunTestCase(const TestCase* test_case) { + return test_case->should_run(); +} + +// AssertHelper constructor. +AssertHelper::AssertHelper(TestPartResult::Type type, + const char* file, + int line, + const char* message) + : data_(new AssertHelperData(type, file, line, message)) { +} + +AssertHelper::~AssertHelper() { + delete data_; +} + +// Message assignment, for assertion streaming support. +void AssertHelper::operator=(const Message& message) const { + UnitTest::GetInstance()-> + AddTestPartResult(data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl() + ->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT +} + +// Mutex for linked pointers. +GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// A copy of all command line arguments. Set by InitGoogleTest(). +::std::vector g_argvs; + +const ::std::vector& GetArgvs() { +#if defined(GTEST_CUSTOM_GET_ARGVS_) + return GTEST_CUSTOM_GET_ARGVS_(); +#else // defined(GTEST_CUSTOM_GET_ARGVS_) + return g_argvs; +#endif // defined(GTEST_CUSTOM_GET_ARGVS_) +} + +// Returns the current application's name, removing directory path if that +// is present. +FilePath GetCurrentExecutableName() { + FilePath result; + +#if GTEST_OS_WINDOWS + result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe")); +#else + result.Set(FilePath(GetArgvs()[0])); +#endif // GTEST_OS_WINDOWS + + return result.RemoveDirectoryName(); +} + +// Functions for processing the gtest_output flag. + +// Returns the output format, or "" for normal printed output. +std::string UnitTestOptions::GetOutputFormat() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) return std::string(""); + + const char* const colon = strchr(gtest_output_flag, ':'); + return (colon == NULL) ? + std::string(gtest_output_flag) : + std::string(gtest_output_flag, colon - gtest_output_flag); +} + +// Returns the name of the requested output file, or the default if none +// was explicitly specified. +std::string UnitTestOptions::GetAbsolutePathToOutputFile() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) + return ""; + + const char* const colon = strchr(gtest_output_flag, ':'); + if (colon == NULL) + return internal::FilePath::ConcatPaths( + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile)).string(); + + internal::FilePath output_name(colon + 1); + if (!output_name.IsAbsolutePath()) + // TODO(wan@google.com): on Windows \some\path is not an absolute + // path (as its meaning depends on the current drive), yet the + // following logic for turning it into an absolute path is wrong. + // Fix it. 
+ output_name = internal::FilePath::ConcatPaths( + internal::FilePath(UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(colon + 1)); + + if (!output_name.IsDirectory()) + return output_name.string(); + + internal::FilePath result(internal::FilePath::GenerateUniqueFileName( + output_name, internal::GetCurrentExecutableName(), + GetOutputFormat().c_str())); + return result.string(); +} + +// Returns true iff the wildcard pattern matches the string. The +// first ':' or '\0' character in pattern marks the end of it. +// +// This recursive algorithm isn't very efficient, but is clear and +// works well enough for matching test names, which are short. +bool UnitTestOptions::PatternMatchesString(const char *pattern, + const char *str) { + switch (*pattern) { + case '\0': + case ':': // Either ':' or '\0' marks the end of the pattern. + return *str == '\0'; + case '?': // Matches any single character. + return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); + case '*': // Matches any string (possibly empty) of characters. + return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || + PatternMatchesString(pattern + 1, str); + default: // Non-special character. Matches itself. + return *pattern == *str && + PatternMatchesString(pattern + 1, str + 1); + } +} + +bool UnitTestOptions::MatchesFilter( + const std::string& name, const char* filter) { + const char *cur_pattern = filter; + for (;;) { + if (PatternMatchesString(cur_pattern, name.c_str())) { + return true; + } + + // Finds the next pattern in the filter. + cur_pattern = strchr(cur_pattern, ':'); + + // Returns if no more pattern can be found. + if (cur_pattern == NULL) { + return false; + } + + // Skips the pattern separater (the ':' character). + cur_pattern++; + } +} + +// Returns true iff the user-specified filter matches the test case +// name and the test name. +bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, + const std::string &test_name) { + const std::string& full_name = test_case_name + "." + test_name.c_str(); + + // Split --gtest_filter at '-', if there is one, to separate into + // positive filter and negative filter portions + const char* const p = GTEST_FLAG(filter).c_str(); + const char* const dash = strchr(p, '-'); + std::string positive; + std::string negative; + if (dash == NULL) { + positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter + negative = ""; + } else { + positive = std::string(p, dash); // Everything up to the dash + negative = std::string(dash + 1); // Everything after the dash + if (positive.empty()) { + // Treat '-test1' as the same as '*-test1' + positive = kUniversalFilter; + } + } + + // A filter is a colon-separated list of patterns. It matches a + // test if any pattern in it matches the test. + return (MatchesFilter(full_name, positive.c_str()) && + !MatchesFilter(full_name, negative.c_str())); +} + +#if GTEST_HAS_SEH +// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the +// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. +// This function is useful as an __except condition. +int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { + // Google Test should handle a SEH exception if: + // 1. the user wants it to, AND + // 2. this is not a breakpoint exception, AND + // 3. this is not a C++ exception (VC++ implements them via SEH, + // apparently). + // + // SEH exception code for C++ exceptions. + // (see http://support.microsoft.com/kb/185294 for more information). 
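// Illustrative, standalone sketch of the recursive wildcard matching used by
// the test filter above: '?' matches one character, '*' matches any possibly
// empty run, and ':' or '\0' ends the pattern. GlobMatches is a hypothetical
// name; it mirrors the logic, not the gtest API.
#include <cstdio>

bool GlobMatches(const char* pattern, const char* str) {
  switch (*pattern) {
    case '\0':
    case ':':  return *str == '\0';
    case '?':  return *str != '\0' && GlobMatches(pattern + 1, str + 1);
    case '*':  return (*str != '\0' && GlobMatches(pattern, str + 1)) ||
                      GlobMatches(pattern + 1, str);
    default:   return *pattern == *str && GlobMatches(pattern + 1, str + 1);
  }
}

int main() {
  // Mirrors --gtest_filter semantics: "FooTest.*" matches every test in
  // FooTest; patterns after a '-' in the flag are negative (exclusions).
  std::printf("%d %d\n",
              GlobMatches("FooTest.*", "FooTest.Baz"),     // 1
              GlobMatches("FooTest.?", "FooTest.Barbar")); // 0
  return 0;
}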
+ const DWORD kCxxExceptionCode = 0xe06d7363; + + bool should_handle = true; + + if (!GTEST_FLAG(catch_exceptions)) + should_handle = false; + else if (exception_code == EXCEPTION_BREAKPOINT) + should_handle = false; + else if (exception_code == kCxxExceptionCode) + should_handle = false; + + return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; +} +#endif // GTEST_HAS_SEH + +} // namespace internal + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. Intercepts only failures from the current thread. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray* result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), + result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray* result) + : intercept_mode_(intercept_mode), + result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. +void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { + return GetTypeId(); +} + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. 
+AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const string& substr) { + const std::string expected(type == TestPartResult::kFatalFailure ? + "1 fatal failure" : + "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult& r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == NULL) { + return AssertionFailure() << "Expected: " << expected << " containing \"" + << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker:: SingleFailureChecker( + const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr) + : results_(results), + type_(type), + substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface* +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface* +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test cases. 
+int UnitTestImpl::successful_test_case_count() const { + return CountIf(test_cases_, TestCasePassed); +} + +// Gets the number of failed test cases. +int UnitTestImpl::failed_test_case_count() const { + return CountIf(test_cases_, TestCaseFailed); +} + +// Gets the number of all test cases. +int UnitTestImpl::total_test_case_count() const { + return static_cast(test_cases_.size()); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTestImpl::test_case_to_run_count() const { + return CountIf(test_cases_, ShouldRunTestCase); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestCaseList(test_cases_, + &TestCase::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + return os_stack_trace_getter()->CurrentStackTrace( + static_cast(GTEST_FLAG(stack_trace_depth)), + skip_count + 1 + // Skips the user-specified number of frames plus this function + // itself. + ); // NOLINT +} + +// Returns the current time in milliseconds. +TimeInMillis GetTimeInMillis() { +#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) + // Difference between 1970-01-01 and 1601-01-01 in milliseconds. + // http://analogous.blogspot.com/2005/04/epoch.html + const TimeInMillis kJavaEpochToWinFileTimeDelta = + static_cast(116444736UL) * 100000UL; + const DWORD kTenthMicrosInMilliSecond = 10000; + + SYSTEMTIME now_systime; + FILETIME now_filetime; + ULARGE_INTEGER now_int64; + // TODO(kenton@google.com): Shouldn't this just use + // GetSystemTimeAsFileTime()? 
+ GetSystemTime(&now_systime); + if (SystemTimeToFileTime(&now_systime, &now_filetime)) { + now_int64.LowPart = now_filetime.dwLowDateTime; + now_int64.HighPart = now_filetime.dwHighDateTime; + now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - + kJavaEpochToWinFileTimeDelta; + return now_int64.QuadPart; + } + return 0; +#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ + __timeb64 now; + + // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 + // (deprecated function) there. + // TODO(kenton@google.com): Use GetTickCount()? Or use + // SystemTimeToFileTime() + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) + _ftime64(&now); + GTEST_DISABLE_MSC_WARNINGS_POP_() + + return static_cast(now.time) * 1000 + now.millitm; +#elif GTEST_HAS_GETTIMEOFDAY_ + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; +#else +# error "Don't know how to get the current time on your system." +#endif +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char* ansi) { + if (!ansi) return NULL; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, + NULL, 0); + WCHAR* unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, + unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return NULL; + const int ansi_length = + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + NULL, 0, NULL, NULL); + char* ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + ansi, ansi_length, NULL, NULL); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true iff they have the same content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char * lhs, const char * rhs) { + if ( lhs == NULL ) return rhs == NULL; + + if ( rhs == NULL ) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. 
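// Illustrative sketch of the gettimeofday() branch of GetTimeInMillis()
// above; POSIX-only (the Windows branches above use FILETIME or _ftime64).
#include <sys/time.h>
#include <cstdio>

long long NowInMillis() {
  struct timeval now;
  gettimeofday(&now, NULL);
  // Seconds scaled to milliseconds plus the microsecond remainder.
  return static_cast<long long>(now.tv_sec) * 1000 + now.tv_usec / 1000;
}

int main() {
  std::printf("ms since epoch: %lld\n", NowInMillis());
  return 0;
}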
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, + Message* msg) { + for (size_t i = 0; i != length; ) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') + i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. +Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message& Message::operator <<(const wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message& Message::operator <<(wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator <<(const ::std::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator <<(const ::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != NULL ? + new ::std::string(*other.message_) : + static_cast< ::std::string*>(NULL)) { +} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != NULL) + negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { + return AssertionResult(true); +} + +// Makes a failed assertion result. 
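// Illustrative sketch of the splitting behavior implemented by SplitString()
// above (Split is a hypothetical stand-in): every delimiter produces a field,
// so "a::b" yields an empty middle element, and an input with no delimiter
// yields a single element.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> Split(const std::string& s, char delim) {
  std::vector<std::string> out;
  std::string::size_type pos = 0;
  for (;;) {
    const std::string::size_type next = s.find(delim, pos);
    if (next == std::string::npos) {
      out.push_back(s.substr(pos));
      break;
    }
    out.push_back(s.substr(pos, next - pos));
    pos = next + 1;
  }
  return out;
}

int main() {
  const std::vector<std::string> parts = Split("a::b", ':');
  for (std::size_t i = 0; i < parts.size(); ++i)
    std::cout << i << ": \"" << parts[i] << "\"\n";  // "a", "", "b"
  return 0;
}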
+AssertionResult AssertionFailure() { + return AssertionResult(false); +} + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +namespace internal { + +namespace edit_distance { +std::vector CalculateOptimalEdits(const std::vector& left, + const std::vector& right) { + std::vector > costs( + left.size() + 1, std::vector(right.size() + 1)); + std::vector > best_move( + left.size() + 1, std::vector(right.size() + 1)); + + // Populate for empty right. + for (size_t l_i = 0; l_i < costs.size(); ++l_i) { + costs[l_i][0] = static_cast(l_i); + best_move[l_i][0] = kRemove; + } + // Populate for empty left. + for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { + costs[0][r_i] = static_cast(r_i); + best_move[0][r_i] = kAdd; + } + + for (size_t l_i = 0; l_i < left.size(); ++l_i) { + for (size_t r_i = 0; r_i < right.size(); ++r_i) { + if (left[l_i] == right[r_i]) { + // Found a match. Consume it. + costs[l_i + 1][r_i + 1] = costs[l_i][r_i]; + best_move[l_i + 1][r_i + 1] = kMatch; + continue; + } + + const double add = costs[l_i + 1][r_i]; + const double remove = costs[l_i][r_i + 1]; + const double replace = costs[l_i][r_i]; + if (add < remove && add < replace) { + costs[l_i + 1][r_i + 1] = add + 1; + best_move[l_i + 1][r_i + 1] = kAdd; + } else if (remove < add && remove < replace) { + costs[l_i + 1][r_i + 1] = remove + 1; + best_move[l_i + 1][r_i + 1] = kRemove; + } else { + // We make replace a little more expensive than add/remove to lower + // their priority. + costs[l_i + 1][r_i + 1] = replace + 1.00001; + best_move[l_i + 1][r_i + 1] = kReplace; + } + } + } + + // Reconstruct the best path. We do it in reverse order. + std::vector best_path; + for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) { + EditType move = best_move[l_i][r_i]; + best_path.push_back(move); + l_i -= move != kAdd; + r_i -= move != kRemove; + } + std::reverse(best_path.begin(), best_path.end()); + return best_path; +} + +namespace { + +// Helper class to convert string into ids with deduplication. +class InternalStrings { + public: + size_t GetId(const std::string& str) { + IdMap::iterator it = ids_.find(str); + if (it != ids_.end()) return it->second; + size_t id = ids_.size(); + return ids_[str] = id; + } + + private: + typedef std::map IdMap; + IdMap ids_; +}; + +} // namespace + +std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right) { + std::vector left_ids, right_ids; + { + InternalStrings intern_table; + for (size_t i = 0; i < left.size(); ++i) { + left_ids.push_back(intern_table.GetId(left[i])); + } + for (size_t i = 0; i < right.size(); ++i) { + right_ids.push_back(intern_table.GetId(right[i])); + } + } + return CalculateOptimalEdits(left_ids, right_ids); +} + +namespace { + +// Helper class that holds the state for one hunk and prints it out to the +// stream. +// It reorders adds/removes when possible to group all removes before all +// adds. It also adds the hunk header before printint into the stream. 
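// Illustrative, self-contained sketch of the edit-script DP described above:
// fill a cost table, prefer a match, otherwise choose add/remove, with
// replace made slightly more expensive, then walk the best-move table
// backwards. Simplified tie-breaking; it mirrors the idea, not the exact
// gtest implementation.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

enum Edit { kMatch, kAdd, kRemove, kReplace };

std::vector<Edit> EditScript(const std::vector<int>& l,
                             const std::vector<int>& r) {
  std::vector<std::vector<double> > cost(
      l.size() + 1, std::vector<double>(r.size() + 1));
  std::vector<std::vector<Edit> > move(
      l.size() + 1, std::vector<Edit>(r.size() + 1));
  for (std::size_t i = 0; i <= l.size(); ++i) {
    cost[i][0] = static_cast<double>(i);
    move[i][0] = kRemove;
  }
  for (std::size_t j = 1; j <= r.size(); ++j) {
    cost[0][j] = static_cast<double>(j);
    move[0][j] = kAdd;
  }
  for (std::size_t i = 0; i < l.size(); ++i) {
    for (std::size_t j = 0; j < r.size(); ++j) {
      if (l[i] == r[j]) {  // match: consume both sides for free
        cost[i + 1][j + 1] = cost[i][j];
        move[i + 1][j + 1] = kMatch;
        continue;
      }
      const double add = cost[i + 1][j] + 1;
      const double rem = cost[i][j + 1] + 1;
      const double rep = cost[i][j] + 1.00001;  // de-prioritize replace
      if (add <= rem && add <= rep) {
        cost[i + 1][j + 1] = add;  move[i + 1][j + 1] = kAdd;
      } else if (rem <= rep) {
        cost[i + 1][j + 1] = rem;  move[i + 1][j + 1] = kRemove;
      } else {
        cost[i + 1][j + 1] = rep;  move[i + 1][j + 1] = kReplace;
      }
    }
  }
  // Reconstruct the path in reverse.
  std::vector<Edit> path;
  for (std::size_t i = l.size(), j = r.size(); i > 0 || j > 0;) {
    const Edit m = move[i][j];
    path.push_back(m);
    if (m != kAdd) --i;
    if (m != kRemove) --j;
  }
  std::reverse(path.begin(), path.end());
  return path;
}

int main() {
  const int a[] = {1, 2, 3}, b[] = {1, 3};
  const std::vector<Edit> s = EditScript(std::vector<int>(a, a + 3),
                                         std::vector<int>(b, b + 2));
  const char* names[] = {"match", "add", "remove", "replace"};
  for (std::size_t i = 0; i < s.size(); ++i) std::printf("%s ", names[s[i]]);
  std::printf("\n");  // expected: match remove match
  return 0;
}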
+class Hunk { + public: + Hunk(size_t left_start, size_t right_start) + : left_start_(left_start), + right_start_(right_start), + adds_(), + removes_(), + common_() {} + + void PushLine(char edit, const char* line) { + switch (edit) { + case ' ': + ++common_; + FlushEdits(); + hunk_.push_back(std::make_pair(' ', line)); + break; + case '-': + ++removes_; + hunk_removes_.push_back(std::make_pair('-', line)); + break; + case '+': + ++adds_; + hunk_adds_.push_back(std::make_pair('+', line)); + break; + } + } + + void PrintTo(std::ostream* os) { + PrintHeader(os); + FlushEdits(); + for (std::list >::const_iterator it = + hunk_.begin(); + it != hunk_.end(); ++it) { + *os << it->first << it->second << "\n"; + } + } + + bool has_edits() const { return adds_ || removes_; } + + private: + void FlushEdits() { + hunk_.splice(hunk_.end(), hunk_removes_); + hunk_.splice(hunk_.end(), hunk_adds_); + } + + // Print a unified diff header for one hunk. + // The format is + // "@@ -, +, @@" + // where the left/right parts are ommitted if unnecessary. + void PrintHeader(std::ostream* ss) const { + *ss << "@@ "; + if (removes_) { + *ss << "-" << left_start_ << "," << (removes_ + common_); + } + if (removes_ && adds_) { + *ss << " "; + } + if (adds_) { + *ss << "+" << right_start_ << "," << (adds_ + common_); + } + *ss << " @@\n"; + } + + size_t left_start_, right_start_; + size_t adds_, removes_, common_; + std::list > hunk_, hunk_adds_, hunk_removes_; +}; + +} // namespace + +// Create a list of diff hunks in Unified diff format. +// Each hunk has a header generated by PrintHeader above plus a body with +// lines prefixed with ' ' for no change, '-' for deletion and '+' for +// addition. +// 'context' represents the desired unchanged prefix/suffix around the diff. +// If two hunks are close enough that their contexts overlap, then they are +// joined into one hunk. +std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context) { + const std::vector edits = CalculateOptimalEdits(left, right); + + size_t l_i = 0, r_i = 0, edit_i = 0; + std::stringstream ss; + while (edit_i < edits.size()) { + // Find first edit. + while (edit_i < edits.size() && edits[edit_i] == kMatch) { + ++l_i; + ++r_i; + ++edit_i; + } + + // Find the first line to include in the hunk. + const size_t prefix_context = std::min(l_i, context); + Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); + for (size_t i = prefix_context; i > 0; --i) { + hunk.PushLine(' ', left[l_i - i].c_str()); + } + + // Iterate the edits until we found enough suffix for the hunk or the input + // is over. + size_t n_suffix = 0; + for (; edit_i < edits.size(); ++edit_i) { + if (n_suffix >= context) { + // Continue only if the next hunk is very close. + std::vector::const_iterator it = edits.begin() + edit_i; + while (it != edits.end() && *it == kMatch) ++it; + if (it == edits.end() || (it - edits.begin()) - edit_i >= context) { + // There is no next edit or it is too far away. + break; + } + } + + EditType edit = edits[edit_i]; + // Reset count when a non match is found. + n_suffix = edit == kMatch ? n_suffix + 1 : 0; + + if (edit == kMatch || edit == kRemove || edit == kReplace) { + hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str()); + } + if (edit == kAdd || edit == kReplace) { + hunk.PushLine('+', right[r_i].c_str()); + } + + // Advance indices, depending on edit type. + l_i += edit != kAdd; + r_i += edit != kRemove; + } + + if (!hunk.has_edits()) { + // We are done. We don't want this hunk. 
+ break; + } + + hunk.PrintTo(&ss); + } + return ss.str(); +} + +} // namespace edit_distance + +namespace { + +// The string representation of the values received in EqFailure() are already +// escaped. Split them on escaped '\n' boundaries. Leave all other escaped +// characters the same. +std::vector SplitEscapedString(const std::string& str) { + std::vector lines; + size_t start = 0, end = str.size(); + if (end > 2 && str[0] == '"' && str[end - 1] == '"') { + ++start; + --end; + } + bool escaped = false; + for (size_t i = start; i + 1 < end; ++i) { + if (escaped) { + escaped = false; + if (str[i] == 'n') { + lines.push_back(str.substr(start, i - start - 1)); + start = i + 1; + } + } else { + escaped = str[i] == '\\'; + } + } + lines.push_back(str.substr(start, end - start)); + return lines; +} + +} // namespace + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// lhs_expression: "foo" +// rhs_expression: "bar" +// lhs_value: "5" +// rhs_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string "Ignoring case" will +// be inserted into the message. +AssertionResult EqFailure(const char* lhs_expression, + const char* rhs_expression, + const std::string& lhs_value, + const std::string& rhs_value, + bool ignoring_case) { + Message msg; + msg << " Expected: " << lhs_expression; + if (lhs_value != lhs_expression) { + msg << "\n Which is: " << lhs_value; + } + msg << "\nTo be equal to: " << rhs_expression; + if (rhs_value != rhs_expression) { + msg << "\n Which is: " << rhs_value; + } + + if (ignoring_case) { + msg << "\nIgnoring case"; + } + + if (!lhs_value.empty() && !rhs_value.empty()) { + const std::vector lhs_lines = + SplitEscapedString(lhs_value); + const std::vector rhs_lines = + SplitEscapedString(rhs_value); + if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { + msg << "\nWith diff:\n" + << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); + } + } + + return AssertionFailure() << msg; +} + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value) { + const char* actual_message = assertion_result.message(); + Message msg; + msg << "Value of: " << expression_text + << "\n Actual: " << actual_predicate_value; + if (actual_message[0] != '\0') + msg << " (" << actual_message << ")"; + msg << "\nExpected: " << expected_predicate_value; + return msg.GetString(); +} + +// Helper function for implementing ASSERT_NEAR. +AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, + double val2, + double abs_error) { + const double diff = fabs(val1 - val2); + if (diff <= abs_error) return AssertionSuccess(); + + // TODO(wan): do not print the value of an expression if it's + // already a literal. 
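// Illustrative sketch of SplitEscapedString() above (SplitEscaped is a
// hypothetical stand-in): the values are already escaped, so lines are split
// on the two-character sequence "\n" (backslash followed by 'n'), after
// stripping surrounding quotes. This is what lets EqFailure() print a
// per-line unified diff of multi-line values.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> SplitEscaped(const std::string& str) {
  std::vector<std::string> lines;
  std::size_t start = 0, end = str.size();
  if (end > 2 && str[0] == '"' && str[end - 1] == '"') { ++start; --end; }
  bool escaped = false;
  for (std::size_t i = start; i + 1 < end; ++i) {
    if (escaped) {
      escaped = false;
      if (str[i] == 'n') {
        lines.push_back(str.substr(start, i - start - 1));
        start = i + 1;
      }
    } else {
      escaped = str[i] == '\\';
    }
  }
  lines.push_back(str.substr(start, end - start));
  return lines;
}

int main() {
  // The escaped value "first\nsecond" splits into "first" and "second".
  const std::vector<std::string> lines = SplitEscaped("\"first\\nsecond\"");
  for (std::size_t i = 0; i < lines.size(); ++i) std::cout << lines[i] << "\n";
  return 0;
}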
+ return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 + << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; +} + + +// Helper template for implementing FloatLE() and DoubleLE(). +template +AssertionResult FloatingPointLE(const char* expr1, + const char* expr2, + RawType val1, + RawType val2) { + // Returns success if val1 is less than val2, + if (val1 < val2) { + return AssertionSuccess(); + } + + // or if val1 is almost equal to val2. + const FloatingPoint lhs(val1), rhs(val2); + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + // Note that the above two checks will both fail if either val1 or + // val2 is NaN, as the IEEE floating-point standard requires that + // any predicate involving a NaN must return false. + + ::std::stringstream val1_ss; + val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val1; + + ::std::stringstream val2_ss; + val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val2; + + return AssertionFailure() + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); +} + +} // namespace internal + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult FloatLE(const char* expr1, const char* expr2, + float val1, float val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult DoubleLE(const char* expr1, const char* expr2, + double val1, double val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +namespace internal { + +// The helper function for {ASSERT|EXPECT}_EQ with int or enum +// arguments. +AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, + BiggestInt lhs, + BiggestInt rhs) { + if (lhs == rhs) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + FormatForComparisonFailureMessage(lhs, rhs), + FormatForComparisonFailureMessage(rhs, lhs), + false); +} + +// A macro for implementing the helper functions needed to implement +// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here +// just to avoid copy-and-paste of similar code. +#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ +AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + BiggestInt val1, BiggestInt val2) {\ + if (val1 op val2) {\ + return AssertionSuccess();\ + } else {\ + return AssertionFailure() \ + << "Expected: (" << expr1 << ") " #op " (" << expr2\ + << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ + << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + }\ +} + +// Implements the helper function for {ASSERT|EXPECT}_NE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(NE, !=) +// Implements the helper function for {ASSERT|EXPECT}_LE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(LE, <=) +// Implements the helper function for {ASSERT|EXPECT}_LT with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(LT, < ) +// Implements the helper function for {ASSERT|EXPECT}_GE with int or +// enum arguments. 
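// Illustrative sketch of the ASSERT_NEAR check implemented above: the
// assertion passes when |val1 - val2| <= abs_error. (FloatLE/DoubleLE above
// instead accept val1 < val2 or values that are "almost equal", which gtest
// defines in units in the last place rather than as an absolute tolerance.)
#include <cmath>
#include <cstdio>

bool NearlyEqual(double val1, double val2, double abs_error) {
  return std::fabs(val1 - val2) <= abs_error;
}

int main() {
  std::printf("%d\n", NearlyEqual(1.0, 1.004, 0.01));  // 1: within 0.01
  std::printf("%d\n", NearlyEqual(1.0, 1.02, 0.01));   // 0: exceeds 0.01
  // Like the real predicate, this fails if either value is NaN, because any
  // comparison involving NaN is false.
  return 0;
}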
+GTEST_IMPL_CMP_HELPER_(GE, >=) +// Implements the helper function for {ASSERT|EXPECT}_GT with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(GT, > ) + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, + const char* lhs, + const char* rhs) { + if (String::CStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + PrintToString(lhs), + PrintToString(rhs), + false); +} + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression, + const char* rhs_expression, + const char* lhs, + const char* rhs) { + if (String::CaseInsensitiveCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + PrintToString(lhs), + PrintToString(rhs), + true); +} + +// The helper function for {ASSERT|EXPECT}_STRNE. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CaseInsensitiveCStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" + << s2_expression << ") (ignoring case), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +} // namespace internal + +namespace { + +// Helper functions for implementing IsSubString() and IsNotSubstring(). + +// This group of overloaded functions return true iff needle is a +// substring of haystack. NULL is considered a substring of itself +// only. + +bool IsSubstringPred(const char* needle, const char* haystack) { + if (needle == NULL || haystack == NULL) + return needle == haystack; + + return strstr(haystack, needle) != NULL; +} + +bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { + if (needle == NULL || haystack == NULL) + return needle == haystack; + + return wcsstr(haystack, needle) != NULL; +} + +// StringType here can be either ::std::string or ::std::wstring. +template +bool IsSubstringPred(const StringType& needle, + const StringType& haystack) { + return haystack.find(needle) != StringType::npos; +} + +// This function implements either IsSubstring() or IsNotSubstring(), +// depending on the value of the expected_to_be_substring parameter. +// StringType here can be const char*, const wchar_t*, ::std::string, +// or ::std::wstring. +template +AssertionResult IsSubstringImpl( + bool expected_to_be_substring, + const char* needle_expr, const char* haystack_expr, + const StringType& needle, const StringType& haystack) { + if (IsSubstringPred(needle, haystack) == expected_to_be_substring) + return AssertionSuccess(); + + const bool is_wide_string = sizeof(needle[0]) > 1; + const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; + return AssertionFailure() + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? 
"" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; +} + +} // namespace + +// IsSubstring() and IsNotSubstring() check whether needle is a +// substring of haystack (NULL is considered a substring of itself +// only), and return an appropriate error message when they fail. + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +#if GTEST_HAS_STD_WSTRING +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +#if GTEST_OS_WINDOWS + +namespace { + +// Helper function for IsHRESULT{SuccessFailure} predicates +AssertionResult HRESULTFailureHelper(const char* expr, + const char* expected, + long hr) { // NOLINT +# if GTEST_OS_WINDOWS_MOBILE + + // Windows CE doesn't support FormatMessage. + const char error_text[] = ""; + +# else + + // Looks up the human-readable system message for the HRESULT code + // and since we're not passing any params to FormatMessage, we don't + // want inserts expanded. + const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kBufSize = 4096; + // Gets the system's human readable message string for this HRESULT. 
+ char error_text[kBufSize] = { '\0' }; + DWORD message_length = ::FormatMessageA(kFlags, + 0, // no source, we're asking system + hr, // the error + 0, // no line width restrictions + error_text, // output buffer + kBufSize, // buf size + NULL); // no arguments for inserts + // Trims tailing white space (FormatMessage leaves a trailing CR-LF) + for (; message_length && IsSpace(error_text[message_length - 1]); + --message_length) { + error_text[message_length - 1] = '\0'; + } + +# endif // GTEST_OS_WINDOWS_MOBILE + + const std::string error_hex("0x" + String::FormatHexInt(hr)); + return ::testing::AssertionFailure() + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; +} + +} // namespace + +AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT + if (SUCCEEDED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "succeeds", hr); +} + +AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT + if (FAILED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "fails", hr); +} + +#endif // GTEST_OS_WINDOWS + +// Utility functions for encoding Unicode text (wide strings) in +// UTF-8. + +// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// like this: +// +// Code-point length Encoding +// 0 - 7 bits 0xxxxxxx +// 8 - 11 bits 110xxxxx 10xxxxxx +// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx +// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +// The maximum code-point a one-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; + +// The maximum code-point a two-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; + +// The maximum code-point a three-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; + +// The maximum code-point a four-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; + +// Chops off the n lowest bits from a bit pattern. Returns the n +// lowest bits. As a side effect, the original bit pattern will be +// shifted to the right by n bits. +inline UInt32 ChopLowBits(UInt32* bits, int n) { + const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); + *bits >>= n; + return low_bits; +} + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +std::string CodePointToUtf8(UInt32 code_point) { + if (code_point > kMaxCodePoint4) { + return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; + } + + char str[5]; // Big enough for the largest valid code point. 
+ if (code_point <= kMaxCodePoint1) { + str[1] = '\0'; + str[0] = static_cast(code_point); // 0xxxxxxx + } else if (code_point <= kMaxCodePoint2) { + str[2] = '\0'; + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xC0 | code_point); // 110xxxxx + } else if (code_point <= kMaxCodePoint3) { + str[3] = '\0'; + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xE0 | code_point); // 1110xxxx + } else { // code_point <= kMaxCodePoint4 + str[4] = '\0'; + str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xF0 | code_point); // 11110xxx + } + return str; +} + +// The following two functions only make sense if the the system +// uses UTF-16 for wide string encoding. All supported systems +// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. + +// Determines if the arguments constitute UTF-16 surrogate pair +// and thus should be combined into a single Unicode code point +// using CreateCodePointFromUtf16SurrogatePair. +inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { + return sizeof(wchar_t) == 2 && + (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; +} + +// Creates a Unicode code point from UTF16 surrogate pair. +inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, + wchar_t second) { + const UInt32 mask = (1 << 10) - 1; + return (sizeof(wchar_t) == 2) ? + (((first & mask) << 10) | (second & mask)) + 0x10000 : + // This function should not be called when the condition is + // false, but we provide a sensible default in case it is. + static_cast(first); +} + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +std::string WideStringToUtf8(const wchar_t* str, int num_chars) { + if (num_chars == -1) + num_chars = static_cast(wcslen(str)); + + ::std::stringstream stream; + for (int i = 0; i < num_chars; ++i) { + UInt32 unicode_code_point; + + if (str[i] == L'\0') { + break; + } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { + unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], + str[i + 1]); + i++; + } else { + unicode_code_point = static_cast(str[i]); + } + + stream << CodePointToUtf8(unicode_code_point); + } + return StringStreamToString(&stream); +} + +// Converts a wide C string to an std::string using the UTF-8 encoding. +// NULL will be converted to "(null)". 
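// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// Worked example of the surrogate-pair and UTF-8 logic above: the UTF-16
// pair (0xD83D, 0xDE00) combines to code point U+1F600, which encodes to the
// four UTF-8 bytes F0 9F 98 80.  The check is self-contained and does not
// call the gtest-internal helpers; the constants are ordinary Unicode facts.
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t high_surrogate = 0xD83D, low_surrogate = 0xDE00;
  const uint32_t mask = (1u << 10) - 1;
  // Same arithmetic as CreateCodePointFromUtf16SurrogatePair() above.
  const uint32_t cp =
      (((high_surrogate & mask) << 10) | (low_surrogate & mask)) + 0x10000;
  assert(cp == 0x1F600);

  // 17-21 bit code points use the 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx layout.
  const unsigned char utf8[4] = {
      static_cast<unsigned char>(0xF0 | (cp >> 18)),
      static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F)),
      static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F)),
      static_cast<unsigned char>(0x80 | (cp & 0x3F)),
  };
  assert(utf8[0] == 0xF0 && utf8[1] == 0x9F);
  assert(utf8[2] == 0x98 && utf8[3] == 0x80);
  return 0;
}
// ---------------------------------------------------------------------------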
+std::string String::ShowWideCString(const wchar_t * wide_c_str) { + if (wide_c_str == NULL) return "(null)"; + + return internal::WideStringToUtf8(wide_c_str, -1); +} + +// Compares two wide C strings. Returns true iff they have the same +// content. +// +// Unlike wcscmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + + return wcscmp(lhs, rhs) == 0; +} + +// Helper function for *_STREQ on wide strings. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, + const wchar_t* lhs, + const wchar_t* rhs) { + if (String::WideCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + PrintToString(lhs), + PrintToString(rhs), + false); +} + +// Helper function for *_STRNE on wide strings. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2) { + if (!String::WideCStringEquals(s1, s2)) { + return AssertionSuccess(); + } + + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: " + << PrintToString(s1) + << " vs " << PrintToString(s2); +} + +// Compares two C strings, ignoring case. Returns true iff they have +// the same content. +// +// Unlike strcasecmp(), this function can handle NULL argument(s). A +// NULL C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { + if (lhs == NULL) + return rhs == NULL; + if (rhs == NULL) + return false; + return posix::StrCaseCmp(lhs, rhs) == 0; +} + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. +bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + +#if GTEST_OS_WINDOWS + return _wcsicmp(lhs, rhs) == 0; +#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID + return wcscasecmp(lhs, rhs) == 0; +#else + // Android, Mac OS X and Cygwin don't define wcscasecmp. + // Other unknown OSes may not define it either. + wint_t left, right; + do { + left = towlower(*lhs++); + right = towlower(*rhs++); + } while (left && left == right); + return left == right; +#endif // OS selector +} + +// Returns true iff str ends with the given suffix, ignoring case. +// Any string is considered to end with an empty suffix. 
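// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// The portable fallback above walks both wide strings with towlower() until
// the characters differ or one string ends.  The same pattern for narrow C
// strings is shown below; the cast through unsigned char matters because
// passing a negative plain char to tolower() is undefined behaviour.
#include <cctype>
#include <cstddef>

static bool CaseInsensitiveEqualsSketch(const char* lhs, const char* rhs) {
  if (lhs == NULL || rhs == NULL) return lhs == rhs;  // NULL equals NULL only
  int left, right;
  do {
    left = std::tolower(static_cast<unsigned char>(*lhs++));
    right = std::tolower(static_cast<unsigned char>(*rhs++));
  } while (left != '\0' && left == right);
  return left == right;
}
// Usage: CaseInsensitiveEqualsSketch("Duck", "dUCK") is true;
//        CaseInsensitiveEqualsSketch("Duck", "Ducks") is false.
// ---------------------------------------------------------------------------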
+bool String::EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix) { + const size_t str_len = str.length(); + const size_t suffix_len = suffix.length(); + return (str_len >= suffix_len) && + CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, + suffix.c_str()); +} + +// Formats an int value as "%02d". +std::string String::FormatIntWidth2(int value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexInt(int value) { + std::stringstream ss; + ss << std::hex << std::uppercase << value; + return ss.str(); +} + +// Formats a byte as "%02X". +std::string String::FormatByte(unsigned char value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase + << static_cast(value); + return ss.str(); +} + +// Converts the buffer in a stringstream to an std::string, converting NUL +// bytes to "\\0" along the way. +std::string StringStreamToString(::std::stringstream* ss) { + const ::std::string& str = ss->str(); + const char* const start = str.c_str(); + const char* const end = start + str.length(); + + std::string result; + result.reserve(2 * (end - start)); + for (const char* ch = start; ch != end; ++ch) { + if (*ch == '\0') { + result += "\\0"; // Replaces NUL with "\\0"; + } else { + result += *ch; + } + } + + return result; +} + +// Appends the user-supplied message to the Google-Test-generated message. +std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg) { + // Appends the user message if it's non-empty. + const std::string user_msg_string = user_msg.GetString(); + if (user_msg_string.empty()) { + return gtest_msg; + } + + return gtest_msg + "\n" + user_msg_string; +} + +} // namespace internal + +// class TestResult + +// Creates an empty TestResult. +TestResult::TestResult() + : death_test_count_(0), + elapsed_time_(0) { +} + +// D'tor. +TestResult::~TestResult() { +} + +// Returns the i-th test part result among all the results. i can +// range from 0 to total_part_count() - 1. If i is not in that range, +// aborts the program. +const TestPartResult& TestResult::GetTestPartResult(int i) const { + if (i < 0 || i >= total_part_count()) + internal::posix::Abort(); + return test_part_results_.at(i); +} + +// Returns the i-th test property. i can range from 0 to +// test_property_count() - 1. If i is not in that range, aborts the +// program. +const TestProperty& TestResult::GetTestProperty(int i) const { + if (i < 0 || i >= test_property_count()) + internal::posix::Abort(); + return test_properties_.at(i); +} + +// Clears the test part results. +void TestResult::ClearTestPartResults() { + test_part_results_.clear(); +} + +// Adds a test part result to the list. +void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { + test_part_results_.push_back(test_part_result); +} + +// Adds a test property to the list. If a property with the same key as the +// supplied property is already represented, the value of this test_property +// replaces the old value for that key. 
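// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// The String::Format*() helpers above rebuild the printf conversions "%02d",
// "%X" and "%02X" out of stream manipulators.  A self-contained check of the
// equivalences (values chosen arbitrarily):
#include <cassert>
#include <iomanip>
#include <sstream>

int main() {
  std::stringstream two_digits, hex_int, byte;
  two_digits << std::setfill('0') << std::setw(2) << 7;              // "%02d"
  hex_int << std::hex << std::uppercase << 3914;                     // "%X"
  byte << std::setfill('0') << std::setw(2) << std::hex
       << std::uppercase << static_cast<unsigned int>(0x0A);         // "%02X"
  assert(two_digits.str() == "07");
  assert(hex_int.str() == "F4A");
  assert(byte.str() == "0A");
  return 0;
}
// Note: std::setw() only applies to the next value inserted, which is why the
// helpers above can chain all the manipulators before the single value.
// ---------------------------------------------------------------------------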
+void TestResult::RecordProperty(const std::string& xml_element, + const TestProperty& test_property) { + if (!ValidateTestProperty(xml_element, test_property)) { + return; + } + internal::MutexLock lock(&test_properites_mutex_); + const std::vector::iterator property_with_matching_key = + std::find_if(test_properties_.begin(), test_properties_.end(), + internal::TestPropertyKeyIs(test_property.key())); + if (property_with_matching_key == test_properties_.end()) { + test_properties_.push_back(test_property); + return; + } + property_with_matching_key->SetValue(test_property.value()); +} + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuitesAttributes[] = { + "disabled", + "errors", + "failures", + "name", + "random_seed", + "tests", + "time", + "timestamp" +}; + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuiteAttributes[] = { + "disabled", + "errors", + "failures", + "name", + "tests", + "time" +}; + +// The list of reserved attributes used in the element of XML output. +static const char* const kReservedTestCaseAttributes[] = { + "classname", + "name", + "status", + "time", + "type_param", + "value_param" +}; + +template +std::vector ArrayAsVector(const char* const (&array)[kSize]) { + return std::vector(array, array + kSize); +} + +static std::vector GetReservedAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + +static std::string FormatWordList(const std::vector& words) { + Message word_list; + for (size_t i = 0; i < words.size(); ++i) { + if (i > 0 && words.size() > 2) { + word_list << ", "; + } + if (i == words.size() - 1) { + word_list << "and "; + } + word_list << "'" << words[i] << "'"; + } + return word_list.GetString(); +} + +bool ValidateTestPropertyName(const std::string& property_name, + const std::vector& reserved_names) { + if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != + reserved_names.end()) { + ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name + << " (" << FormatWordList(reserved_names) + << " are reserved by " << GTEST_NAME_ << ")"; + return false; + } + return true; +} + +// Adds a failure if the key is a reserved attribute of the element named +// xml_element. Returns true if the property is valid. +bool TestResult::ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property) { + return ValidateTestPropertyName(test_property.key(), + GetReservedAttributesForElement(xml_element)); +} + +// Clears the object. +void TestResult::Clear() { + test_part_results_.clear(); + test_properties_.clear(); + death_test_count_ = 0; + elapsed_time_ = 0; +} + +// Returns true iff the test failed. +bool TestResult::Failed() const { + for (int i = 0; i < total_part_count(); ++i) { + if (GetTestPartResult(i).failed()) + return true; + } + return false; +} + +// Returns true iff the test part fatally failed. 
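// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// RecordProperty() above is an "update-or-append by key" over a flat vector:
// std::find_if locates an entry with a matching key, and the value is either
// overwritten or a new entry is pushed back.  The same pattern on a plain
// vector of key/value pairs (names here are invented for the example):
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

typedef std::pair<std::string, std::string> Property;

static void SetProperty(std::vector<Property>* props,
                        const std::string& key, const std::string& value) {
  const std::vector<Property>::iterator match =
      std::find_if(props->begin(), props->end(),
                   [&key](const Property& p) { return p.first == key; });
  if (match == props->end()) {
    props->push_back(Property(key, value));  // no such key yet: append
  } else {
    match->second = value;                   // existing key: overwrite value
  }
}
// A std::map would behave the same way, but the vector keeps the properties
// in the order they were first recorded.
// ---------------------------------------------------------------------------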
+static bool TestPartFatallyFailed(const TestPartResult& result) { + return result.fatally_failed(); +} + +// Returns true iff the test fatally failed. +bool TestResult::HasFatalFailure() const { + return CountIf(test_part_results_, TestPartFatallyFailed) > 0; +} + +// Returns true iff the test part non-fatally failed. +static bool TestPartNonfatallyFailed(const TestPartResult& result) { + return result.nonfatally_failed(); +} + +// Returns true iff the test has a non-fatal failure. +bool TestResult::HasNonfatalFailure() const { + return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; +} + +// Gets the number of all test parts. This is the sum of the number +// of successful test parts and the number of failed test parts. +int TestResult::total_part_count() const { + return static_cast(test_part_results_.size()); +} + +// Returns the number of the test properties. +int TestResult::test_property_count() const { + return static_cast(test_properties_.size()); +} + +// class Test + +// Creates a Test object. + +// The c'tor saves the states of all flags. +Test::Test() + : gtest_flag_saver_(new GTEST_FLAG_SAVER_) { +} + +// The d'tor restores the states of all flags. The actual work is +// done by the d'tor of the gtest_flag_saver_ field, and thus not +// visible here. +Test::~Test() { +} + +// Sets up the test fixture. +// +// A sub-class may override this. +void Test::SetUp() { +} + +// Tears down the test fixture. +// +// A sub-class may override this. +void Test::TearDown() { +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, const std::string& value) { + UnitTest::GetInstance()->RecordProperty(key, value); +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, int value) { + Message value_message; + value_message << value; + RecordProperty(key, value_message.GetString().c_str()); +} + +namespace internal { + +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message) { + // This function is a friend of UnitTest and as such has access to + // AddTestPartResult. + UnitTest::GetInstance()->AddTestPartResult( + result_type, + NULL, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. + message, + ""); // No stack trace, either. +} + +} // namespace internal + +// Google Test requires all tests in the same test case to use the same test +// fixture class. This function checks if the current test has the +// same fixture class as the first test in the current test case. If +// yes, it returns true; otherwise it generates a Google Test failure and +// returns false. +bool Test::HasSameFixtureClass() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + const TestCase* const test_case = impl->current_test_case(); + + // Info about the first test in the current test case. + const TestInfo* const first_test_info = test_case->test_info_list()[0]; + const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; + const char* const first_test_name = first_test_info->name(); + + // Info about the current test. 
+ const TestInfo* const this_test_info = impl->current_test_info(); + const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; + const char* const this_test_name = this_test_info->name(); + + if (this_fixture_id != first_fixture_id) { + // Is the first test defined using TEST? + const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); + // Is this test defined using TEST? + const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); + + if (first_is_TEST || this_is_TEST) { + // Both TEST and TEST_F appear in same test case, which is incorrect. + // Tell the user how to fix this. + + // Gets the name of the TEST and the name of the TEST_F. Note + // that first_is_TEST and this_is_TEST cannot both be true, as + // the fixture IDs are different for the two tests. + const char* const TEST_name = + first_is_TEST ? first_test_name : this_test_name; + const char* const TEST_F_name = + first_is_TEST ? this_test_name : first_test_name; + + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test case is\n" + << "illegal. In test case " << this_test_info->test_case_name() + << ",\n" + << "test " << TEST_F_name << " is defined using TEST_F but\n" + << "test " << TEST_name << " is defined using TEST. You probably\n" + << "want to change the TEST to TEST_F or move it to another test\n" + << "case."; + } else { + // Two fixture classes with the same name appear in two different + // namespaces, which is not allowed. Tell the user how to fix this. + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " + << this_test_info->test_case_name() << ",\n" + << "you defined test " << first_test_name + << " and test " << this_test_name << "\n" + << "using two different test fixture classes. This can happen if\n" + << "the two classes are from different namespaces or translation\n" + << "units and have the same name. You should probably rename one\n" + << "of the classes to put the tests into different test cases."; + } + return false; + } + + return true; +} + +#if GTEST_HAS_SEH + +// Adds an "exception thrown" fatal failure to the current test. This +// function returns its result via an output parameter pointer because VC++ +// prohibits creation of objects with destructors on stack in functions +// using __try (see error C2712). +static std::string* FormatSehExceptionMessage(DWORD exception_code, + const char* location) { + Message message; + message << "SEH exception with code 0x" << std::setbase(16) << + exception_code << std::setbase(10) << " thrown in " << location << "."; + + return new std::string(message.GetString()); +} + +#endif // GTEST_HAS_SEH + +namespace internal { + +#if GTEST_HAS_EXCEPTIONS + +// Adds an "exception thrown" fatal failure to the current test. 
+static std::string FormatCxxExceptionMessage(const char* description, + const char* location) { + Message message; + if (description != NULL) { + message << "C++ exception with description \"" << description << "\""; + } else { + message << "Unknown C++ exception"; + } + message << " thrown in " << location << "."; + + return message.GetString(); +} + +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result); + +GoogleTestFailureException::GoogleTestFailureException( + const TestPartResult& failure) + : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} + +#endif // GTEST_HAS_EXCEPTIONS + +// We put these helper functions in the internal namespace as IBM's xlC +// compiler rejects the code if they were declared static. + +// Runs the given method and handles SEH exceptions it throws, when +// SEH is supported; returns the 0-value for type Result in case of an +// SEH exception. (Microsoft compilers cannot handle SEH and C++ +// exceptions in the same function. Therefore, we provide a separate +// wrapper function for handling SEH exceptions.) +template +Result HandleSehExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { +#if GTEST_HAS_SEH + __try { + return (object->*method)(); + } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT + GetExceptionCode())) { + // We create the exception message on the heap because VC++ prohibits + // creation of objects with destructors on stack in functions using __try + // (see error C2712). + std::string* exception_message = FormatSehExceptionMessage( + GetExceptionCode(), location); + internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, + *exception_message); + delete exception_message; + return static_cast(0); + } +#else + (void)location; + return (object->*method)(); +#endif // GTEST_HAS_SEH +} + +// Runs the given method and catches and reports C++ and/or SEH-style +// exceptions, if they are supported; returns the 0-value for type +// Result in case of an SEH exception. +template +Result HandleExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { + // NOTE: The user code can affect the way in which Google Test handles + // exceptions by setting GTEST_FLAG(catch_exceptions), but only before + // RUN_ALL_TESTS() starts. It is technically possible to check the flag + // after the exception is caught and either report or re-throw the + // exception based on the flag's value: + // + // try { + // // Perform the test method. + // } catch (...) { + // if (GTEST_FLAG(catch_exceptions)) + // // Report the exception as failure. + // else + // throw; // Re-throws the original exception. + // } + // + // However, the purpose of this flag is to allow the program to drop into + // the debugger when the exception is thrown. On most platforms, once the + // control enters the catch block, the exception origin information is + // lost and the debugger will stop the program at the point of the + // re-throw in this function -- instead of at the point of the original + // throw statement in the code under test. For this reason, we perform + // the check early, sacrificing the ability to affect Google Test's + // exception handling in the method where the exception is thrown. 
+ if (internal::GetUnitTestImpl()->catch_exceptions()) { +#if GTEST_HAS_EXCEPTIONS + try { + return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const internal::GoogleTestFailureException&) { // NOLINT + // This exception type can only be thrown by a failed Google + // Test assertion with the intention of letting another testing + // framework catch it. Therefore we just re-throw it. + throw; + } catch (const std::exception& e) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(e.what(), location)); + } catch (...) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(NULL, location)); + } + return static_cast(0); +#else + return HandleSehExceptionsInMethodIfSupported(object, method, location); +#endif // GTEST_HAS_EXCEPTIONS + } else { + return (object->*method)(); + } +} + +} // namespace internal + +// Runs the test and updates the test result. +void Test::Run() { + if (!HasSameFixtureClass()) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); + // We will run the test only if SetUp() was successful. + if (!HasFatalFailure()) { + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TestBody, "the test body"); + } + + // However, we want to clean up as much as possible. Hence we will + // always call TearDown(), even if SetUp() or the test body has + // failed. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TearDown, "TearDown()"); +} + +// Returns true iff the current test has a fatal failure. +bool Test::HasFatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); +} + +// Returns true iff the current test has a non-fatal failure. +bool Test::HasNonfatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()-> + HasNonfatalFailure(); +} + +// class TestInfo + +// Constructs a TestInfo object. It assumes ownership of the test factory +// object. +TestInfo::TestInfo(const std::string& a_test_case_name, + const std::string& a_name, + const char* a_type_param, + const char* a_value_param, + internal::CodeLocation a_code_location, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory) + : test_case_name_(a_test_case_name), + name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + value_param_(a_value_param ? new std::string(a_value_param) : NULL), + location_(a_code_location), + fixture_class_id_(fixture_class_id), + should_run_(false), + is_disabled_(false), + matches_filter_(false), + factory_(factory), + result_() {} + +// Destructs a TestInfo object. +TestInfo::~TestInfo() { delete factory_; } + +namespace internal { + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a value-parameterized test. 
+// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + CodeLocation code_location, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory) { + TestInfo* const test_info = + new TestInfo(test_case_name, name, type_param, value_param, + code_location, fixture_class_id, factory); + GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); + return test_info; +} + +#if GTEST_HAS_PARAM_TEST +void ReportInvalidTestCaseType(const char* test_case_name, + CodeLocation code_location) { + Message errors; + errors + << "Attempted redefinition of test case " << test_case_name << ".\n" + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " << test_case_name << ", you tried\n" + << "to define a test using a fixture class different from the one\n" + << "used earlier. This can happen if the two fixture classes are\n" + << "from different namespaces and have the same name. You should\n" + << "probably rename one of the classes to put the tests into different\n" + << "test cases."; + + fprintf(stderr, "%s %s", + FormatFileLocation(code_location.file.c_str(), + code_location.line).c_str(), + errors.GetString().c_str()); +} +#endif // GTEST_HAS_PARAM_TEST + +} // namespace internal + +namespace { + +// A predicate that checks the test name of a TestInfo against a known +// value. +// +// This is used for implementation of the TestCase class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestNameIs is copyable. +class TestNameIs { + public: + // Constructor. + // + // TestNameIs has NO default constructor. + explicit TestNameIs(const char* name) + : name_(name) {} + + // Returns true iff the test name of test_info matches name_. + bool operator()(const TestInfo * test_info) const { + return test_info && test_info->name() == name_; + } + + private: + std::string name_; +}; + +} // namespace + +namespace internal { + +// This method expands all parameterized tests registered with macros TEST_P +// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. +// This will be done just once during the program runtime. +void UnitTestImpl::RegisterParameterizedTests() { +#if GTEST_HAS_PARAM_TEST + if (!parameterized_tests_registered_) { + parameterized_test_registry_.RegisterTests(); + parameterized_tests_registered_ = true; + } +#endif +} + +} // namespace internal + +// Creates the test object, runs it, records its result, and then +// deletes it. +void TestInfo::Run() { + if (!should_run_) return; + + // Tells UnitTest where to store test result. + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. 
+ repeater->OnTestStart(*this); + + const TimeInMillis start = internal::GetTimeInMillis(); + + impl->os_stack_trace_getter()->UponLeavingGTest(); + + // Creates the test object. + Test* const test = internal::HandleExceptionsInMethodIfSupported( + factory_, &internal::TestFactoryBase::CreateTest, + "the test fixture's constructor"); + + // Runs the test only if the test object was created and its + // constructor didn't generate a fatal failure. + if ((test != NULL) && !Test::HasFatalFailure()) { + // This doesn't throw as all user code that can throw are wrapped into + // exception handling code. + test->Run(); + } + + // Deletes the test object. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); + + result_.set_elapsed_time(internal::GetTimeInMillis() - start); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + + // Tells UnitTest to stop associating assertion results to this + // test. + impl->set_current_test_info(NULL); +} + +// class TestCase + +// Gets the number of successful tests in this test case. +int TestCase::successful_test_count() const { + return CountIf(test_info_list_, TestPassed); +} + +// Gets the number of failed tests in this test case. +int TestCase::failed_test_count() const { + return CountIf(test_info_list_, TestFailed); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int TestCase::reportable_disabled_test_count() const { + return CountIf(test_info_list_, TestReportableDisabled); +} + +// Gets the number of disabled tests in this test case. +int TestCase::disabled_test_count() const { + return CountIf(test_info_list_, TestDisabled); +} + +// Gets the number of tests to be printed in the XML report. +int TestCase::reportable_test_count() const { + return CountIf(test_info_list_, TestReportable); +} + +// Get the number of tests in this test case that should run. +int TestCase::test_to_run_count() const { + return CountIf(test_info_list_, ShouldRunTest); +} + +// Gets the number of all tests. +int TestCase::total_test_count() const { + return static_cast(test_info_list_.size()); +} + +// Creates a TestCase with the given name. +// +// Arguments: +// +// name: name of the test case +// a_type_param: the name of the test case's type parameter, or NULL if +// this is not a typed or a type-parameterized test case. +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +TestCase::TestCase(const char* a_name, const char* a_type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc) + : name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + set_up_tc_(set_up_tc), + tear_down_tc_(tear_down_tc), + should_run_(false), + elapsed_time_(0) { +} + +// Destructor of TestCase. +TestCase::~TestCase() { + // Deletes every Test in the collection. + ForEach(test_info_list_, internal::Delete); +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +const TestInfo* TestCase::GetTestInfo(int i) const { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. 
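// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// Every *_test_count() accessor above is the same one-liner: CountIf() over
// test_info_list_ with a different predicate.  The standard-library shape of
// that pattern, with invented stand-in types:
#include <algorithm>
#include <vector>

struct FakeResult { bool passed; };

static bool Passed(const FakeResult& r) { return r.passed; }

static int PassedCount(const std::vector<FakeResult>& results) {
  return static_cast<int>(
      std::count_if(results.begin(), results.end(), Passed));
}
// With results = { {true}, {false}, {true} }, PassedCount(results) == 2.
// ---------------------------------------------------------------------------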
+TestInfo* TestCase::GetMutableTestInfo(int i) { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Adds a test to this test case. Will delete the test upon +// destruction of the TestCase object. +void TestCase::AddTestInfo(TestInfo * test_info) { + test_info_list_.push_back(test_info); + test_indices_.push_back(static_cast(test_indices_.size())); +} + +// Runs every test in this TestCase. +void TestCase::Run() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_case(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + repeater->OnTestCaseStart(*this); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); + + const internal::TimeInMillis start = internal::GetTimeInMillis(); + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Run(); + } + elapsed_time_ = internal::GetTimeInMillis() - start; + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); + + repeater->OnTestCaseEnd(*this); + impl->set_current_test_case(NULL); +} + +// Clears the results of all tests in this test case. +void TestCase::ClearResult() { + ad_hoc_test_result_.Clear(); + ForEach(test_info_list_, TestInfo::ClearTestResult); +} + +// Shuffles the tests in this test case. +void TestCase::ShuffleTests(internal::Random* random) { + Shuffle(random, &test_indices_); +} + +// Restores the test order to before the first shuffle. +void TestCase::UnshuffleTests() { + for (size_t i = 0; i < test_indices_.size(); i++) { + test_indices_[i] = static_cast(i); + } +} + +// Formats a countable noun. Depending on its quantity, either the +// singular form or the plural form is used. e.g. +// +// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". +// FormatCountableNoun(5, "book", "books") returns "5 books". +static std::string FormatCountableNoun(int count, + const char * singular_form, + const char * plural_form) { + return internal::StreamableToString(count) + " " + + (count == 1 ? singular_form : plural_form); +} + +// Formats the count of tests. +static std::string FormatTestCount(int test_count) { + return FormatCountableNoun(test_count, "test", "tests"); +} + +// Formats the count of test cases. +static std::string FormatTestCaseCount(int test_case_count) { + return FormatCountableNoun(test_case_count, "test case", "test cases"); +} + +// Converts a TestPartResult::Type enum to human-friendly string +// representation. Both kNonFatalFailure and kFatalFailure are translated +// to "Failure", as the user usually doesn't care about the difference +// between the two when viewing the test result. +static const char * TestPartResultTypeToString(TestPartResult::Type type) { + switch (type) { + case TestPartResult::kSuccess: + return "Success"; + + case TestPartResult::kNonFatalFailure: + case TestPartResult::kFatalFailure: +#ifdef _MSC_VER + return "error: "; +#else + return "Failure\n"; +#endif + default: + return "Unknown result type"; + } +} + +namespace internal { + +// Prints a TestPartResult to an std::string. 
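// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// ShuffleTests()/UnshuffleTests() above never reorder the TestInfo objects;
// they shuffle a parallel vector of indices (test_indices_) and route every
// lookup through it, so the original order can be restored exactly.  A
// stand-alone sketch of that indirection, with std::shuffle standing in for
// internal::Random:
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <string>
#include <vector>

class ShuffledView {
 public:
  explicit ShuffledView(const std::vector<std::string>& items)
      : items_(items), indices_(items.size()) {
    std::iota(indices_.begin(), indices_.end(), 0);  // 0, 1, 2, ...
  }
  void Shuffle(unsigned seed) {
    std::mt19937 rng(seed);
    std::shuffle(indices_.begin(), indices_.end(), rng);
  }
  void Unshuffle() { std::iota(indices_.begin(), indices_.end(), 0); }
  const std::string& Get(std::size_t i) const { return items_[indices_[i]]; }

 private:
  const std::vector<std::string>& items_;  // storage order never changes
  std::vector<std::size_t> indices_;       // only the viewing order changes
};
// The referenced 'items' vector must outlive the view, as with any reference
// member; the code above instead owns its test_info_list_ directly.
// ---------------------------------------------------------------------------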
+static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result) { + return (Message() + << internal::FormatFileLocation(test_part_result.file_name(), + test_part_result.line_number()) + << " " << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()).GetString(); +} + +// Prints a TestPartResult. +static void PrintTestPartResult(const TestPartResult& test_part_result) { + const std::string& result = + PrintTestPartResultToString(test_part_result); + printf("%s\n", result.c_str()); + fflush(stdout); + // If the test program runs in Visual Studio or a debugger, the + // following statements add the test part result message to the Output + // window such that the user can double-click on it to jump to the + // corresponding source code location; otherwise they do nothing. +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + // We don't call OutputDebugString*() on Windows Mobile, as printing + // to stdout is done by OutputDebugString() there already - we don't + // want the same message printed twice. + ::OutputDebugStringA(result.c_str()); + ::OutputDebugStringA("\n"); +#endif +} + +// class PrettyUnitTestResultPrinter + +enum GTestColor { + COLOR_DEFAULT, + COLOR_RED, + COLOR_GREEN, + COLOR_YELLOW +}; + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Returns the character attribute for the given color. +WORD GetColorAttribute(GTestColor color) { + switch (color) { + case COLOR_RED: return FOREGROUND_RED; + case COLOR_GREEN: return FOREGROUND_GREEN; + case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + default: return 0; + } +} + +#else + +// Returns the ANSI color code for the given color. COLOR_DEFAULT is +// an invalid input. +const char* GetAnsiColorCode(GTestColor color) { + switch (color) { + case COLOR_RED: return "1"; + case COLOR_GREEN: return "2"; + case COLOR_YELLOW: return "3"; + default: return NULL; + }; +} + +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns true iff Google Test should use colors in the output. +bool ShouldUseColor(bool stdout_is_tty) { + const char* const gtest_color = GTEST_FLAG(color).c_str(); + + if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { +#if GTEST_OS_WINDOWS + // On Windows the TERM variable is usually not set, but the + // console there does support colors. + return stdout_is_tty; +#else + // On non-Windows platforms, we rely on the TERM variable. + const char* const term = posix::GetEnv("TERM"); + const bool term_supports_color = + String::CStringEquals(term, "xterm") || + String::CStringEquals(term, "xterm-color") || + String::CStringEquals(term, "xterm-256color") || + String::CStringEquals(term, "screen") || + String::CStringEquals(term, "screen-256color") || + String::CStringEquals(term, "tmux") || + String::CStringEquals(term, "tmux-256color") || + String::CStringEquals(term, "rxvt-unicode") || + String::CStringEquals(term, "rxvt-unicode-256color") || + String::CStringEquals(term, "linux") || + String::CStringEquals(term, "cygwin"); + return stdout_is_tty && term_supports_color; +#endif // GTEST_OS_WINDOWS + } + + return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); + // We take "yes", "true", "t", and "1" as meaning "yes". 
If the + // value is neither one of these nor "auto", we treat it as "no" to + // be conservative. +} + +// Helpers for printing colored strings to stdout. Note that on Windows, we +// cannot simply emit special characters and have the terminal change colors. +// This routine must actually emit the characters rather than return a string +// that would be colored when printed, as can be done on Linux. +void ColoredPrintf(GTestColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \ + GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT + const bool use_color = AlwaysFalse(); +#else + static const bool in_color_mode = + ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); + const bool use_color = in_color_mode && (color != COLOR_DEFAULT); +#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS + // The '!= 0' comparison is necessary to satisfy MSVC 7.1. + + if (!use_color) { + vprintf(fmt, args); + va_end(args); + return; + } + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, + GetColorAttribute(color) | FOREGROUND_INTENSITY); + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + printf("\033[0;3%sm", GetAnsiColorCode(color)); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + va_end(args); +} + +// Text printed in Google Test's text output and --gunit_list_tests +// output to label the type parameter and value parameter for a test. +static const char kTypeParamLabel[] = "TypeParam"; +static const char kValueParamLabel[] = "GetParam()"; + +void PrintFullTestCommentIfPresent(const TestInfo& test_info) { + const char* const type_param = test_info.type_param(); + const char* const value_param = test_info.value_param(); + + if (type_param != NULL || value_param != NULL) { + printf(", where "); + if (type_param != NULL) { + printf("%s = %s", kTypeParamLabel, type_param); + if (value_param != NULL) + printf(" and "); + } + if (value_param != NULL) { + printf("%s = %s", kValueParamLabel, value_param); + } + } +} + +// This class implements the TestEventListener interface. +// +// Class PrettyUnitTestResultPrinter is copyable. +class PrettyUnitTestResultPrinter : public TestEventListener { + public: + PrettyUnitTestResultPrinter() {} + static void PrintTestName(const char * test_case, const char * test) { + printf("%s.%s", test_case, test); + } + + // The following methods override what's in the TestEventListener class. 
+ virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& test_case); + virtual void OnTestStart(const TestInfo& test_info); + virtual void OnTestPartResult(const TestPartResult& result); + virtual void OnTestEnd(const TestInfo& test_info); + virtual void OnTestCaseEnd(const TestCase& test_case); + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + + private: + static void PrintFailedTests(const UnitTest& unit_test); +}; + + // Fired before each iteration of tests starts. +void PrettyUnitTestResultPrinter::OnTestIterationStart( + const UnitTest& unit_test, int iteration) { + if (GTEST_FLAG(repeat) != 1) + printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); + + const char* const filter = GTEST_FLAG(filter).c_str(); + + // Prints the filter if it's not *. This reminds the user that some + // tests may be skipped. + if (!String::CStringEquals(filter, kUniversalFilter)) { + ColoredPrintf(COLOR_YELLOW, + "Note: %s filter = %s\n", GTEST_NAME_, filter); + } + + if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { + const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + ColoredPrintf(COLOR_YELLOW, + "Note: This is test shard %d of %s.\n", + static_cast(shard_index) + 1, + internal::posix::GetEnv(kTestTotalShards)); + } + + if (GTEST_FLAG(shuffle)) { + ColoredPrintf(COLOR_YELLOW, + "Note: Randomizing tests' orders with a seed of %d .\n", + unit_test.random_seed()); + } + + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("Running %s from %s.\n", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment set-up.\n"); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s", counts.c_str(), test_case.name()); + if (test_case.type_param() == NULL) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { + ColoredPrintf(COLOR_GREEN, "[ RUN ] "); + PrintTestName(test_info.test_case_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +// Called after an assertion failure. +void PrettyUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + // If the test part succeeded, we don't need to do anything. + if (result.type() == TestPartResult::kSuccess) + return; + + // Print failure message from the assertion (e.g. expected this and got that). 
+ PrintTestPartResult(result); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Passed()) { + ColoredPrintf(COLOR_GREEN, "[ OK ] "); + } else { + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + } + PrintTestName(test_info.test_case_name(), test_info.name()); + if (test_info.result()->Failed()) + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG(print_time)) { + printf(" (%s ms)\n", internal::StreamableToString( + test_info.result()->elapsed_time()).c_str()); + } else { + printf("\n"); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { + if (!GTEST_FLAG(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s (%s ms total)\n\n", + counts.c_str(), test_case.name(), + internal::StreamableToString(test_case.elapsed_time()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment tear-down\n"); + fflush(stdout); +} + +// Internal helper for printing the list of failed tests. +void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { + const int failed_test_count = unit_test.failed_test_count(); + if (failed_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + const TestCase& test_case = *unit_test.GetTestCase(i); + if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_case.total_test_count(); ++j) { + const TestInfo& test_info = *test_case.GetTestInfo(j); + if (!test_info.should_run() || test_info.result()->Passed()) { + continue; + } + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s.%s", test_case.name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + printf("\n"); + } + } +} + +void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + if (GTEST_FLAG(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + int num_failures = unit_test.failed_test_count(); + if (!unit_test.Passed()) { + const int failed_test_count = unit_test.failed_test_count(); + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); + PrintFailedTests(unit_test); + printf("\n%2d FAILED %s\n", num_failures, + num_failures == 1 ? "TEST" : "TESTS"); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (!num_failures) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(COLOR_YELLOW, + " YOU HAVE %d DISABLED %s\n\n", + num_disabled, + num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. 
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + i);
+      return listener;
+    }
+  }
+
+  return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
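
As a rough sketch of how the repeater's events reach user code (not part of this patch; the listener class and message below are illustrative only), a test program can append its own listener through the public TestEventListeners interface defined later in this file, and the repeater then forwards every event to it alongside the default printer:

#include <cstdio>
#include "gtest/gtest.h"

// Illustrative listener: reacts to the same OnTestStart events the repeater
// forwards above.  EmptyTestEventListener provides no-op defaults.
class MinimalLogger : public ::testing::EmptyTestEventListener {
  virtual void OnTestStart(const ::testing::TestInfo& test_info) {
    printf("starting %s.%s\n", test_info.test_case_name(), test_info.name());
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::testing::TestEventListeners& listeners =
      ::testing::UnitTest::GetInstance()->listeners();
  listeners.Append(new MinimalLogger);  // ownership passes to Google Test
  return RUN_ALL_TESTS();
}
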
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
+      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+ static void OutputXmlTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info); + + // Prints an XML representation of a TestCase object + static void PrintXmlTestCase(::std::ostream* stream, + const TestCase& test_case); + + // Prints an XML summary of unit_test to output stream out. + static void PrintXmlUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as space + // delimited XML attributes based on the property key="value" pairs. + // When the std::string is not empty, it includes a space at the beginning, + // to delimit this attribute from prior attributes. + static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + + // The output file. + const std::string output_file_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); +}; + +// Creates a new XmlUnitTestResultPrinter. +XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.c_str() == NULL || output_file_.empty()) { + fprintf(stderr, "XML output file may not be null\n"); + fflush(stderr); + exit(EXIT_FAILURE); + } +} + +// Called after the unit test ends. +void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* xmlout = NULL; + FilePath output_file(output_file_); + FilePath output_dir(output_file.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + xmlout = posix::FOpen(output_file_.c_str(), "w"); + } + if (xmlout == NULL) { + // TODO(wan): report the reason of the failure. + // + // We don't do it for now as: + // + // 1. There is no urgent need for it. + // 2. It's a bit involved to make the errno variable thread-safe on + // all three operating systems (Linux, Windows, and Mac OS). + // 3. To interpret the meaning of errno in a thread-safe way, + // we need the strerror_r() function, which is not available on + // Windows. + fprintf(stderr, + "Unable to open file \"%s\"\n", + output_file_.c_str()); + fflush(stderr); + exit(EXIT_FAILURE); + } + std::stringstream stream; + PrintXmlUnitTest(&stream, unit_test); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +// Returns an XML-escaped copy of the input string str. If is_attribute +// is true, the text is meant to appear as an attribute value, and +// normalizable whitespace is preserved by replacing it with character +// references. +// +// Invalid XML characters in str, if any, are stripped from the output. +// It is expected that most, if not all, of the text processed by this +// module will consist of ordinary English text. +// If this module is ever modified to produce version 1.1 XML output, +// most invalid characters can be retained using character references. +// TODO(wan): It might be nice to have a minimally invasive, human-readable +// escaping scheme for invalid characters, rather than dropping them. 
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3);
+  return ss.str();
+}
+
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == NULL)
+    return false;
+  *out = *tm_ptr;
+  return true;
+#else
+  return localtime_r(&seconds, out) != NULL;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct.tm_min) + ":" +
+      String::FormatIntWidth2(time_struct.tm_sec);
+}
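
The escaping rules above are small enough to restate in isolation. The sketch below is a hypothetical stand-alone approximation of the attribute-mode behaviour (EscapeForXmlAttribute is not the class member, and its handling of non-ASCII bytes is simplified), not part of this patch:

#include <cstdio>
#include <string>

// Hypothetical re-statement of attribute-mode XML escaping: the five XML
// special characters become entities, TAB/LF/CR become numeric character
// references so they survive attribute-value normalization, and remaining
// control characters are dropped.
static std::string EscapeForXmlAttribute(const std::string& str) {
  std::string out;
  for (size_t i = 0; i < str.size(); ++i) {
    const char ch = str[i];
    switch (ch) {
      case '<':  out += "&lt;";    break;
      case '>':  out += "&gt;";    break;
      case '&':  out += "&amp;";   break;
      case '\'': out += "&apos;";  break;
      case '"':  out += "&quot;";  break;
      case '\t': out += "&#x09;";  break;
      case '\n': out += "&#x0A;";  break;
      case '\r': out += "&#x0D;";  break;
      default:
        if (static_cast<unsigned char>(ch) >= 0x20)  // drop control characters
          out += ch;
        break;
    }
  }
  return out;
}

int main() {
  // Prints: Value&lt;int&gt; == &quot;1 &amp; 2&quot;&#x0A;
  printf("%s\n", EscapeForXmlAttribute("Value<int> == \"1 & 2\"\n").c_str());
  return 0;
}
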
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != NULL) {
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  if (test_info.value_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "type_param",
+                       test_info.type_param());
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+  *stream << TestPropertiesAsXmlAttributes(result);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      const string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(summary.c_str())
+              << "\" type=\"\">";
+      const string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0)
+    *stream << " />\n";
+  else
+    *stream << "    </" << kTestcase << ">\n";
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "failures",
+                     StreamableToString(test_case.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "disabled",
+      StreamableToString(test_case.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
+          << ">\n";
+
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    if (test_case.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
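
Taken together, the per-test and per-test-case printers above and the unit-test-level printer that follows emit a report shaped roughly like the sample below. The suite and test names, counts, timestamp, and times are invented for illustration; attribute order follows the code above:

<?xml version="1.0" encoding="UTF-8"?>
<testsuites tests="2" failures="1" disabled="0" errors="0"
            timestamp="2018-01-01T00:00:00" time="0.004" name="AllTests">
  <testsuite name="FooTest" tests="2" failures="1" disabled="0" errors="0" time="0.004">
    <testcase name="HandlesBar" status="run" time="0.001" classname="FooTest" />
    <testcase name="HandlesBaz" status="run" time="0.003" classname="FooTest">
      <failure message="foo_test.cc:42&#x0A;Value of: ..." type=""><![CDATA[foo_test.cc:42
Value of: ...]]></failure>
    </testcase>
  </testsuite>
</testsuites>
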
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default:
+        result.push_back(ch);
+        break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+ for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; + cur_addr = cur_addr->ai_next) { + sockfd_ = socket( + cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + if (sockfd_ != -1) { + // Connect the client socket to the server socket. + if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { + close(sockfd_); + sockfd_ = -1; + } + } + } + + freeaddrinfo(servinfo); // all done with this structure + + if (sockfd_ == -1) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " + << host_name_ << ":" << port_num_; + } +} + +// End of class Streaming Listener +#endif // GTEST_CAN_STREAM_RESULTS__ + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message = message.GetString(); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor. +ScopedTrace::~ScopedTrace() + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + + +// class OsStackTraceGetter + +const char* const OsStackTraceGetterInterface::kElidedFramesMarker = + "... " GTEST_NAME_ " internal frames ..."; + +string OsStackTraceGetter::CurrentStackTrace(int /*max_depth*/, + int /*skip_count*/) { + return ""; +} + +void OsStackTraceGetter::UponLeavingGTest() {} + +// A helper class that creates the premature-exit file in its +// constructor and deletes the file in its destructor. +class ScopedPrematureExitFile { + public: + explicit ScopedPrematureExitFile(const char* premature_exit_filepath) + : premature_exit_filepath_(premature_exit_filepath) { + // If a path to the premature-exit file is specified... + if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { + // create the file with a single "0" character in it. I/O + // errors are ignored as there's nothing better we can do and we + // don't want to fail the test because of this. + FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); + fwrite("0", 1, 1, pfile); + fclose(pfile); + } + } + + ~ScopedPrematureExitFile() { + if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { + remove(premature_exit_filepath_); + } + } + + private: + const char* const premature_exit_filepath_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); +}; + +} // namespace internal + +// class TestEventListeners + +TestEventListeners::TestEventListeners() + : repeater_(new internal::TestEventRepeater()), + default_result_printer_(NULL), + default_xml_generator_(NULL) { +} + +TestEventListeners::~TestEventListeners() { delete repeater_; } + +// Returns the standard listener responsible for the default console +// output. Can be removed from the listeners list to shut down default +// console output. Note that removing this object from the listener list +// with Release transfers its ownership to the user. +void TestEventListeners::Append(TestEventListener* listener) { + repeater_->Append(listener); +} + +// Removes the given event listener from the list and returns it. It then +// becomes the caller's responsibility to delete the listener. Returns +// NULL if the listener is not found in the list. 
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) { + if (listener == default_result_printer_) + default_result_printer_ = NULL; + else if (listener == default_xml_generator_) + default_xml_generator_ = NULL; + return repeater_->Release(listener); +} + +// Returns repeater that broadcasts the TestEventListener events to all +// subscribers. +TestEventListener* TestEventListeners::repeater() { return repeater_; } + +// Sets the default_result_printer attribute to the provided listener. +// The listener is also added to the listener list and previous +// default_result_printer is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { + if (default_result_printer_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_result_printer_); + default_result_printer_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Sets the default_xml_generator attribute to the provided listener. The +// listener is also added to the listener list and previous +// default_xml_generator is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { + if (default_xml_generator_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_xml_generator_); + default_xml_generator_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Controls whether events will be forwarded by the repeater to the +// listeners in the list. +bool TestEventListeners::EventForwardingEnabled() const { + return repeater_->forwarding_enabled(); +} + +void TestEventListeners::SuppressEventForwarding() { + repeater_->set_forwarding_enabled(false); +} + +// class UnitTest + +// Gets the singleton UnitTest object. The first time this method is +// called, a UnitTest object is constructed and returned. Consecutive +// calls will return the same object. +// +// We don't protect this under mutex_ as a user is not supposed to +// call this before main() starts, from which point on the return +// value will never change. +UnitTest* UnitTest::GetInstance() { + // When compiled with MSVC 7.1 in optimized mode, destroying the + // UnitTest object upon exiting the program messes up the exit code, + // causing successful tests to appear failed. We have to use a + // different implementation in this case to bypass the compiler bug. + // This implementation makes the compiler happy, at the cost of + // leaking the UnitTest object. + + // CodeGear C++Builder insists on a public destructor for the + // default implementation. Use this implementation to keep good OO + // design with private destructor. + +#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) + static UnitTest* const instance = new UnitTest; + return instance; +#else + static UnitTest instance; + return &instance; +#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) +} + +// Gets the number of successful test cases. 
+int UnitTest::successful_test_case_count() const { + return impl()->successful_test_case_count(); +} + +// Gets the number of failed test cases. +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_case_count(); +} + +// Gets the number of all test cases. +int UnitTest::total_test_case_count() const { + return impl()->total_test_case_count(); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTest::test_case_to_run_count() const { + return impl()->test_case_to_run_count(); +} + +// Gets the number of successful tests. +int UnitTest::successful_test_count() const { + return impl()->successful_test_count(); +} + +// Gets the number of failed tests. +int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTest::reportable_disabled_test_count() const { + return impl()->reportable_disabled_test_count(); +} + +// Gets the number of disabled tests. +int UnitTest::disabled_test_count() const { + return impl()->disabled_test_count(); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTest::reportable_test_count() const { + return impl()->reportable_test_count(); +} + +// Gets the number of all tests. +int UnitTest::total_test_count() const { return impl()->total_test_count(); } + +// Gets the number of tests that should run. +int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } + +// Gets the time of the test program start, in ms from the start of the +// UNIX epoch. +internal::TimeInMillis UnitTest::start_timestamp() const { + return impl()->start_timestamp(); +} + +// Gets the elapsed time, in milliseconds. +internal::TimeInMillis UnitTest::elapsed_time() const { + return impl()->elapsed_time(); +} + +// Returns true iff the unit test passed (i.e. all test cases passed). +bool UnitTest::Passed() const { return impl()->Passed(); } + +// Returns true iff the unit test failed (i.e. some test case failed +// or something outside of all tests failed). +bool UnitTest::Failed() const { return impl()->Failed(); } + +// Gets the i-th test case among all the test cases. i can range from 0 to +// total_test_case_count() - 1. If i is not in that range, returns NULL. +const TestCase* UnitTest::GetTestCase(int i) const { + return impl()->GetTestCase(i); +} + +// Returns the TestResult containing information on test failures and +// properties logged outside of individual test cases. +const TestResult& UnitTest::ad_hoc_test_result() const { + return *impl()->ad_hoc_test_result(); +} + +// Gets the i-th test case among all the test cases. i can range from 0 to +// total_test_case_count() - 1. If i is not in that range, returns NULL. +TestCase* UnitTest::GetMutableTestCase(int i) { + return impl()->GetMutableTestCase(i); +} + +// Returns the list of event listeners that can be used to track events +// inside Google Test. +TestEventListeners& UnitTest::listeners() { + return *impl()->listeners(); +} + +// Registers and returns a global test environment. When a test +// program is run, all global test environments will be set-up in the +// order they were registered. After all tests in the program have +// finished, all global test environments will be torn-down in the +// *reverse* order they were registered. +// +// The UnitTest object takes ownership of the given environment. 
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() { + const bool in_death_test_child_process = + internal::GTEST_FLAG(internal_run_death_test).length() > 0; + + // Google Test implements this protocol for catching that a test + // program exits before returning control to Google Test: + // + // 1. Upon start, Google Test creates a file whose absolute path + // is specified by the environment variable + // TEST_PREMATURE_EXIT_FILE. + // 2. When Google Test has finished its work, it deletes the file. + // + // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before + // running a Google-Test-based test program and check the existence + // of the file at the end of the test execution to see if it has + // exited prematurely. + + // If we are in the child process of a death test, don't + // create/delete the premature exit file, as doing so is unnecessary + // and will confuse the parent process. Otherwise, create/delete + // the file upon entering/leaving this function. If the program + // somehow exits before this function has a chance to return, the + // premature-exit file will be left undeleted, causing a test runner + // that understands the premature-exit-file protocol to report the + // test as having failed. + const internal::ScopedPrematureExitFile premature_exit_file( + in_death_test_child_process ? + NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); + + // Captures the value of GTEST_FLAG(catch_exceptions). This value will be + // used for the duration of the program. + impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); + +#if GTEST_HAS_SEH + // Either the user wants Google Test to catch exceptions thrown by the + // tests or this is executing in the context of death test child + // process. In either case the user does not want to see pop-up dialogs + // about crashes - they are expected. + if (impl()->catch_exceptions() || in_death_test_child_process) { +# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + // SetErrorMode doesn't exist on CE. + SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | + SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); +# endif // !GTEST_OS_WINDOWS_MOBILE + +# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE + // Death test children can be terminated with _abort(). On Windows, + // _abort() can show a dialog with a warning message. This forces the + // abort message to go to stderr instead. + _set_error_mode(_OUT_TO_STDERR); +# endif + +# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE + // In the debug version, Visual Studio pops up a separate dialog + // offering a choice to debug the aborted program. We need to suppress + // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement + // executed. Google Test will notify the user of any unexpected + // failure via stderr. + // + // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. + // Users of prior VC versions shall suffer the agony and pain of + // clicking through the countless debug dialogs. + // TODO(vladl@google.com): find a way to suppress the abort dialog() in the + // debug mode when compiled with VC 7.1 or lower. + if (!GTEST_FLAG(break_on_failure)) + _set_abort_behavior( + 0x0, // Clear the following flags: + _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. +# endif + } +#endif // GTEST_HAS_SEH + + return internal::HandleExceptionsInMethodIfSupported( + impl(), + &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") ? 
0 : 1; +} + +// Returns the working directory when the first TEST() or TEST_F() was +// executed. +const char* UnitTest::original_working_dir() const { + return impl_->original_working_dir_.c_str(); +} + +// Returns the TestCase object for the test that's currently running, +// or NULL if no test is running. +const TestCase* UnitTest::current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_case(); +} + +// Returns the TestInfo object for the test that's currently running, +// or NULL if no test is running. +const TestInfo* UnitTest::current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_info(); +} + +// Returns the random seed used at the start of the current test run. +int UnitTest::random_seed() const { return impl_->random_seed(); } + +#if GTEST_HAS_PARAM_TEST +// Returns ParameterizedTestCaseRegistry object used to keep track of +// value-parameterized tests and instantiate and register them. +internal::ParameterizedTestCaseRegistry& + UnitTest::parameterized_test_registry() + GTEST_LOCK_EXCLUDED_(mutex_) { + return impl_->parameterized_test_registry(); +} +#endif // GTEST_HAS_PARAM_TEST + +// Creates an empty UnitTest. +UnitTest::UnitTest() { + impl_ = new internal::UnitTestImpl(this); +} + +// Destructor of UnitTest. +UnitTest::~UnitTest() { + delete impl_; +} + +// Pushes a trace defined by SCOPED_TRACE() on to the per-thread +// Google Test trace stack. +void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().push_back(trace); +} + +// Pops a trace from the per-thread Google Test trace stack. +void UnitTest::PopGTestTrace() + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().pop_back(); +} + +namespace internal { + +UnitTestImpl::UnitTestImpl(UnitTest* parent) + : parent_(parent), + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */) + default_global_test_part_result_reporter_(this), + default_per_thread_test_part_result_reporter_(this), + GTEST_DISABLE_MSC_WARNINGS_POP_() + global_test_part_result_repoter_( + &default_global_test_part_result_reporter_), + per_thread_test_part_result_reporter_( + &default_per_thread_test_part_result_reporter_), +#if GTEST_HAS_PARAM_TEST + parameterized_test_registry_(), + parameterized_tests_registered_(false), +#endif // GTEST_HAS_PARAM_TEST + last_death_test_case_(-1), + current_test_case_(NULL), + current_test_info_(NULL), + ad_hoc_test_result_(), + os_stack_trace_getter_(NULL), + post_flag_parse_init_performed_(false), + random_seed_(0), // Will be overridden by the flag before first use. + random_(0), // Will be reseeded before first use. + start_timestamp_(0), + elapsed_time_(0), +#if GTEST_HAS_DEATH_TEST + death_test_factory_(new DefaultDeathTestFactory), +#endif + // Will be overridden by the flag before first use. + catch_exceptions_(false) { + listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); +} + +UnitTestImpl::~UnitTestImpl() { + // Deletes every TestCase. + ForEach(test_cases_, internal::Delete); + + // Deletes every Environment. 
+ ForEach(environments_, internal::Delete); + + delete os_stack_trace_getter_; +} + +// Adds a TestProperty to the current TestResult object when invoked in a +// context of a test, to current test case's ad_hoc_test_result when invoke +// from SetUpTestCase/TearDownTestCase, or to the global property set +// otherwise. If the result already contains a property with the same key, +// the value will be updated. +void UnitTestImpl::RecordProperty(const TestProperty& test_property) { + std::string xml_element; + TestResult* test_result; // TestResult appropriate for property recording. + + if (current_test_info_ != NULL) { + xml_element = "testcase"; + test_result = &(current_test_info_->result_); + } else if (current_test_case_ != NULL) { + xml_element = "testsuite"; + test_result = &(current_test_case_->ad_hoc_test_result_); + } else { + xml_element = "testsuites"; + test_result = &ad_hoc_test_result_; + } + test_result->RecordProperty(xml_element, test_property); +} + +#if GTEST_HAS_DEATH_TEST +// Disables event forwarding if the control is currently in a death test +// subprocess. Must not be called before InitGoogleTest. +void UnitTestImpl::SuppressTestEventsIfInSubprocess() { + if (internal_run_death_test_flag_.get() != NULL) + listeners()->SuppressEventForwarding(); +} +#endif // GTEST_HAS_DEATH_TEST + +// Initializes event listeners performing XML output as specified by +// UnitTestOptions. Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureXmlOutput() { + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml") { + listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format != "") { + printf("WARNING: unrecognized output format \"%s\" ignored.\n", + output_format.c_str()); + fflush(stdout); + } +} + +#if GTEST_CAN_STREAM_RESULTS_ +// Initializes event listeners for streaming test results in string form. +// Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureStreamingOutput() { + const std::string& target = GTEST_FLAG(stream_result_to); + if (!target.empty()) { + const size_t pos = target.find(':'); + if (pos != std::string::npos) { + listeners()->Append(new StreamingListener(target.substr(0, pos), + target.substr(pos+1))); + } else { + printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", + target.c_str()); + fflush(stdout); + } + } +} +#endif // GTEST_CAN_STREAM_RESULTS_ + +// Performs initialization dependent upon flag values obtained in +// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to +// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest +// this function is also called from RunAllTests. Since this function can be +// called more than once, it has to be idempotent. +void UnitTestImpl::PostFlagParsingInit() { + // Ensures that this function does not execute more than once. + if (!post_flag_parse_init_performed_) { + post_flag_parse_init_performed_ = true; + +#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + // Register to send notifications about key process state changes. + listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_()); +#endif // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + +#if GTEST_HAS_DEATH_TEST + InitDeathTestSubprocessControlInfo(); + SuppressTestEventsIfInSubprocess(); +#endif // GTEST_HAS_DEATH_TEST + + // Registers parameterized tests. 
This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+  }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.
+  explicit TestCaseNameIs(const std::string& name)
+      : name_(name) {}
+
+  // Returns true iff the name of test_case matches name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_iterator test_case =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.end())
+    return *test_case;
+
+  // No.  Let's create one.
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case?
+  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that +// parameterized tests are ready to be counted and run. +bool UnitTestImpl::RunAllTests() { + // Makes sure InitGoogleTest() was called. + if (!GTestIsInitialized()) { + printf("%s", + "\nThis test program did NOT call ::testing::InitGoogleTest " + "before calling RUN_ALL_TESTS(). Please fix it.\n"); + return false; + } + + // Do not run any test if the --help flag was specified. + if (g_help_flag) + return true; + + // Repeats the call to the post-flag parsing initialization in case the + // user didn't call InitGoogleTest. + PostFlagParsingInit(); + + // Even if sharding is not on, test runners may want to use the + // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding + // protocol. + internal::WriteToShardStatusFileIfNeeded(); + + // True iff we are in a subprocess for running a thread-safe-style + // death test. + bool in_subprocess_for_death_test = false; + +#if GTEST_HAS_DEATH_TEST + in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); +# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) + if (in_subprocess_for_death_test) { + GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); + } +# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // GTEST_HAS_DEATH_TEST + + const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, + in_subprocess_for_death_test); + + // Compares the full test names with the filter to decide which + // tests to run. + const bool has_tests_to_run = FilterTests(should_shard + ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; + + // Lists the tests and exits if the --gtest_list_tests flag was specified. + if (GTEST_FLAG(list_tests)) { + // This must be called *after* FilterTests() has been called. + ListTestsMatchingFilter(); + return true; + } + + random_seed_ = GTEST_FLAG(shuffle) ? + GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + + // True iff at least one test has failed. + bool failed = false; + + TestEventListener* repeater = listeners()->repeater(); + + start_timestamp_ = GetTimeInMillis(); + repeater->OnTestProgramStart(*parent_); + + // How many times to repeat the tests? We don't want to repeat them + // when we are inside the subprocess of a death test. + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + // Repeats forever if the repeat count is negative. + const bool forever = repeat < 0; + for (int i = 0; forever || i != repeat; i++) { + // We want to preserve failures generated by ad-hoc test + // assertions executed before RUN_ALL_TESTS(). + ClearNonAdHocTestResult(); + + const TimeInMillis start = GetTimeInMillis(); + + // Shuffles test cases and tests if requested. + if (has_tests_to_run && GTEST_FLAG(shuffle)) { + random()->Reseed(random_seed_); + // This should be done before calling OnTestIterationStart(), + // such that a test event listener can see the actual test order + // in the event. + ShuffleTests(); + } + + // Tells the unit test event listeners that the tests are about to start. + repeater->OnTestIterationStart(*parent_, i); + + // Runs each test case if there is at least one test to run. + if (has_tests_to_run) { + // Sets up all environments beforehand. + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + + // Runs the tests only if there was no fatal failure during global + // set-up. 
+ if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_case_count(); + test_index++) { + GetMutableTestCase(test_index)->Run(); + } + } + + // Tears down all environments in reverse order afterwards. + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } + + elapsed_time_ = GetTimeInMillis() - start; + + // Tells the unit test event listener that the tests have just finished. + repeater->OnTestIterationEnd(*parent_, i); + + // Gets the result and clears it. + if (!Passed()) { + failed = true; + } + + // Restores the original test order after the iteration. This + // allows the user to quickly repro a failure that happens in the + // N-th iteration without repeating the first (N - 1) iterations. + // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in + // case the user somehow changes the value of the flag somewhere + // (it's always safe to unshuffle the tests). + UnshuffleTests(); + + if (GTEST_FLAG(shuffle)) { + // Picks a new random seed for each iteration. + random_seed_ = GetNextRandomSeed(random_seed_); + } + } + + repeater->OnTestProgramEnd(*parent_); + + return !failed; +} + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded() { + const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); + if (test_shard_file != NULL) { + FILE* const file = posix::FOpen(test_shard_file, "w"); + if (file == NULL) { + ColoredPrintf(COLOR_RED, + "Could not write to the test shard status file \"%s\" " + "specified by the %s environment variable.\n", + test_shard_file, kTestShardStatusFile); + fflush(stdout); + exit(EXIT_FAILURE); + } + fclose(file); + } +} + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (i.e., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. 
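
Before the helpers that implement it, a stand-alone illustration of the sharding contract may help: the shard count and shard index come from the environment (read through kTestTotalShards and kTestShardIndex), and runnable tests are dealt out round-robin by ShouldRunTestOnShard() below, so each test runs on exactly one shard. The snippet is not gtest code and the numbers are made up:

#include <cstdio>

// Round-robin shard assignment: runnable test i goes to shard
// (i % total_shards), mirroring ShouldRunTestOnShard() below.
int main() {
  const int total_shards = 3;  // e.g. a total-shard count of 3 from the environment
  for (int shard_index = 0; shard_index < total_shards; ++shard_index) {
    printf("shard %d runs tests:", shard_index);
    for (int test_id = 0; test_id < 8; ++test_id) {
      if (test_id % total_shards == shard_index)  // the ShouldRunTestOnShard() rule
        printf(" %d", test_id);
    }
    printf("\n");
  }
  return 0;
}
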
+bool ShouldShard(const char* total_shards_env, + const char* shard_index_env, + bool in_subprocess_for_death_test) { + if (in_subprocess_for_death_test) { + return false; + } + + const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); + + if (total_shards == -1 && shard_index == -1) { + return false; + } else if (total_shards == -1 && shard_index != -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (total_shards != -1 && shard_index == -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (shard_index < 0 || shard_index >= total_shards) { + const Message msg = Message() + << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } + + return total_shards > 1; +} + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error +// and aborts. +Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { + const char* str_val = posix::GetEnv(var); + if (str_val == NULL) { + return default_val; + } + + Int32 result; + if (!ParseInt32(Message() << "The value of environment variable " << var, + str_val, &result)) { + exit(EXIT_FAILURE); + } + return result; +} + +// Given the total number of shards, the shard index, and the test id, +// returns true iff the test should be run on this shard. The test id is +// some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { + return (test_id % total_shards) == shard_index; +} + +// Compares the name of each test with the user-specified filter to +// decide whether the test should be run, then records the result in +// each TestCase and TestInfo object. +// If shard_tests == true, further filters tests based on sharding +// variables in the environment - see +// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. +// Returns the number of tests that should run. +int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { + const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestTotalShards, -1) : -1; + const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestShardIndex, -1) : -1; + + // num_runnable_tests are the number of tests that will + // run across all shards (i.e., match filter and are not disabled). + // num_selected_tests are the number of tests to be run on + // this shard. 
+ int num_runnable_tests = 0; + int num_selected_tests = 0; + for (size_t i = 0; i < test_cases_.size(); i++) { + TestCase* const test_case = test_cases_[i]; + const std::string &test_case_name = test_case->name(); + test_case->set_should_run(false); + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + TestInfo* const test_info = test_case->test_info_list()[j]; + const std::string test_name(test_info->name()); + // A test is disabled if test case name or test name matches + // kDisableTestFilter. + const bool is_disabled = + internal::UnitTestOptions::MatchesFilter(test_case_name, + kDisableTestFilter) || + internal::UnitTestOptions::MatchesFilter(test_name, + kDisableTestFilter); + test_info->is_disabled_ = is_disabled; + + const bool matches_filter = + internal::UnitTestOptions::FilterMatchesTest(test_case_name, + test_name); + test_info->matches_filter_ = matches_filter; + + const bool is_runnable = + (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + matches_filter; + + const bool is_selected = is_runnable && + (shard_tests == IGNORE_SHARDING_PROTOCOL || + ShouldRunTestOnShard(total_shards, shard_index, + num_runnable_tests)); + + num_runnable_tests += is_runnable; + num_selected_tests += is_selected; + + test_info->should_run_ = is_selected; + test_case->set_should_run(test_case->should_run() || is_selected); + } + } + return num_selected_tests; +} + +// Prints the given C-string on a single line by replacing all '\n' +// characters with string "\\n". If the output takes more than +// max_length characters, only prints the first max_length characters +// and "...". +static void PrintOnOneLine(const char* str, int max_length) { + if (str != NULL) { + for (int i = 0; *str != '\0'; ++str) { + if (i >= max_length) { + printf("..."); + break; + } + if (*str == '\n') { + printf("\\n"); + i += 2; + } else { + printf("%c", *str); + ++i; + } + } + } +} + +// Prints the names of the tests matching the user-specified filter flag. +void UnitTestImpl::ListTestsMatchingFilter() { + // Print at most this many characters for each type/value parameter. + const int kMaxParamLength = 250; + + for (size_t i = 0; i < test_cases_.size(); i++) { + const TestCase* const test_case = test_cases_[i]; + bool printed_test_case_name = false; + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + const TestInfo* const test_info = + test_case->test_info_list()[j]; + if (test_info->matches_filter_) { + if (!printed_test_case_name) { + printed_test_case_name = true; + printf("%s.", test_case->name()); + if (test_case->type_param() != NULL) { + printf(" # %s = ", kTypeParamLabel); + // We print the type parameter on a single line to make + // the output easy to parse by a program. + PrintOnOneLine(test_case->type_param(), kMaxParamLength); + } + printf("\n"); + } + printf(" %s", test_info->name()); + if (test_info->value_param() != NULL) { + printf(" # %s = ", kValueParamLabel); + // We print the value parameter on a single line to make the + // output easy to parse by a program. + PrintOnOneLine(test_info->value_param(), kMaxParamLength); + } + printf("\n"); + } + } + } + fflush(stdout); +} + +// Sets the OS stack trace getter. +// +// Does nothing if the input and the current OS stack trace getter are +// the same; otherwise, deletes the old getter and makes the input the +// current getter. 
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == NULL) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult for the test that's currently running, or
+// the TestResult for the ad hoc test if no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+  return current_test_info_ ?
+      &(current_test_info_->result_) : &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test cases.
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Shuffles the non-death test cases.
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Shuffles the tests inside each test case.
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    test_cases_[i]->ShuffleTests(random());
+  }
+}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    // Unshuffles the tests in each test case.
+    test_cases_[i]->UnshuffleTests();
+    // Resets the index of each test case.
+    test_case_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) { + const size_t prefix_len = strlen(prefix); + if (strncmp(*pstr, prefix, prefix_len) == 0) { + *pstr += prefix_len; + return true; + } + return false; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +const char* ParseFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. + const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseBoolFlag(const char* str, const char* flag, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for an Int32 flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseInt32(Message() << "The value of flag --" << flag, + value_str, value); +} + +// Parses a string for a string flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseStringFlag(const char* str, const char* flag, std::string* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// Determines whether a string has a prefix that Google Test uses for its +// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. 
+// If Google Test detects that a command line flag has its prefix but is not +// recognized, it will print its help message. Flags starting with +// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test +// internal flags and do not trigger the help message. +static bool HasGoogleTestFlagPrefix(const char* str) { + return (SkipPrefix("--", &str) || + SkipPrefix("-", &str) || + SkipPrefix("/", &str)) && + !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && + (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || + SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); +} + +// Prints a string containing code-encoded text. The following escape +// sequences can be used in the string to control the text color: +// +// @@ prints a single '@' character. +// @R changes the color to red. +// @G changes the color to green. +// @Y changes the color to yellow. +// @D changes to the default terminal text color. +// +// TODO(wan@google.com): Write tests for this once we add stdout +// capturing to Google Test. +static void PrintColorEncoded(const char* str) { + GTestColor color = COLOR_DEFAULT; // The current color. + + // Conceptually, we split the string into segments divided by escape + // sequences. Then we print one segment at a time. At the end of + // each iteration, the str pointer advances to the beginning of the + // next segment. + for (;;) { + const char* p = strchr(str, '@'); + if (p == NULL) { + ColoredPrintf(color, "%s", str); + return; + } + + ColoredPrintf(color, "%s", std::string(str, p).c_str()); + + const char ch = p[1]; + str = p + 2; + if (ch == '@') { + ColoredPrintf(color, "@"); + } else if (ch == 'D') { + color = COLOR_DEFAULT; + } else if (ch == 'R') { + color = COLOR_RED; + } else if (ch == 'G') { + color = COLOR_GREEN; + } else if (ch == 'Y') { + color = COLOR_YELLOW; + } else { + --str; + } + } +} + +static const char kColorEncodedHelpMessage[] = +"This program contains tests written using " GTEST_NAME_ ". You can use the\n" +"following command line flags to control its behavior:\n" +"\n" +"Test Selection:\n" +" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" +" List the names of all tests instead of running them. The name of\n" +" TEST(Foo, Bar) is \"Foo.Bar\".\n" +" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" + "[@G-@YNEGATIVE_PATTERNS]@D\n" +" Run only the tests whose name matches one of the positive patterns but\n" +" none of the negative patterns. '?' matches any single character; '*'\n" +" matches any substring; ':' separates two patterns.\n" +" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" +" Run all disabled tests too.\n" +"\n" +"Test Execution:\n" +" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" +" Run the tests repeatedly; use a negative count to repeat forever.\n" +" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" +" Randomize tests' orders on every iteration.\n" +" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" +" Random number seed to use for shuffling test orders (between 1 and\n" +" 99999, or 0 to use a seed based on the current time).\n" +"\n" +"Test Output:\n" +" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" +" Enable/disable colored output. The default is @Gauto@D.\n" +" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" +" Don't print the elapsed time of each test.\n" +" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" + GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" +" Generate an XML report in the given directory or with the given file\n" +" name. 
@YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+bool ParseGoogleTestFlag(const char* const arg) {
+  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                       &GTEST_FLAG(also_run_disabled_tests)) ||
+      ParseBoolFlag(arg, kBreakOnFailureFlag,
+                    &GTEST_FLAG(break_on_failure)) ||
+      ParseBoolFlag(arg, kCatchExceptionsFlag,
+                    &GTEST_FLAG(catch_exceptions)) ||
+      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+      ParseStringFlag(arg, kDeathTestStyleFlag,
+                      &GTEST_FLAG(death_test_style)) ||
+      ParseBoolFlag(arg, kDeathTestUseFork,
+                    &GTEST_FLAG(death_test_use_fork)) ||
+      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+      ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                      &GTEST_FLAG(internal_run_death_test)) ||
+      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+      ParseInt32Flag(arg, kStackTraceDepthFlag,
+                     &GTEST_FLAG(stack_trace_depth)) ||
+      ParseStringFlag(arg, kStreamResultToFlag,
+                      &GTEST_FLAG(stream_result_to)) ||
+      ParseBoolFlag(arg, kThrowOnFailureFlag,
+                    &GTEST_FLAG(throw_on_failure));
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+void LoadFlagsFromFile(const std::string& path) {
+  FILE* flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    fprintf(stderr,
+            "Unable to open file \"%s\"\n",
+            GTEST_FLAG(flagfile).c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (size_t i = 0; i < lines.size(); ++i) {
+    if (lines[i].empty())
+      continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str()))
+      g_help_flag = true;
+  }
+}
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
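The flag machinery above recognizes both "--flag" and "--flag=value" forms via ParseFlagValue(). The following standalone sketch (not part of the patch) mirrors that logic with a hypothetical "--demo_" prefix and function name, purely to illustrate how the two forms are distinguished:

// Illustrative re-implementation of the ParseFlagValue() pattern; all names
// here (DemoParseFlagValue, "--demo_") are made up for the example.
#include <cstring>
#include <iostream>
#include <string>

// Returns a pointer to the value part of "--demo_<flag>=<value>", the empty
// string for a bare "--demo_<flag>" when the value is optional, or NULL when
// the argument does not match the flag at all.
const char* DemoParseFlagValue(const char* str, const char* flag,
                               bool def_optional) {
  if (str == NULL || flag == NULL) return NULL;
  const std::string flag_str = std::string("--demo_") + flag;
  if (strncmp(str, flag_str.c_str(), flag_str.length()) != 0) return NULL;
  const char* flag_end = str + flag_str.length();
  // A bare flag is acceptable only when the value part is optional.
  if (def_optional && flag_end[0] == '\0') return flag_end;
  if (flag_end[0] != '=') return NULL;
  return flag_end + 1;  // Skip the '=' and return the value.
}

int main() {
  std::cout << DemoParseFlagValue("--demo_filter=Foo.*", "filter", false)
            << "\n";  // prints "Foo.*"
  std::cout << (DemoParseFlagValue("--demo_filter", "filter", false) == NULL)
            << "\n";  // prints 1: value required but no '=' was given
  return 0;
}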
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    bool remove_flag = false;
+    if (ParseGoogleTestFlag(arg)) {
+      remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
+      LoadFlagsFromFile(GTEST_FLAG(flagfile));
+      remove_flag = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+
+    if (remove_flag) {
+      // Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // We don't want to run the initialization code twice.
+  if (GTestIsInitialized()) return;
+
+  if (*argc <= 0) return;
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
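As the comment above states, InitGoogleTest() must run before RUN_ALL_TESTS() so that flag parsing happens first. A minimal, conventional call site looks like the sketch below (not part of the patch; it assumes the gtest headers are on the include path, and SampleSuite/AlwaysPasses are placeholder names):

#include "gtest/gtest.h"

// Placeholder test so the binary has something to run.
TEST(SampleSuite, AlwaysPasses) {
  EXPECT_EQ(4, 2 + 2);
}

int main(int argc, char** argv) {
  // Consumes recognized --gtest_* flags, removing them from argv and
  // decrementing argc; remaining arguments are left for the caller.
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.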
+void InitGoogleTest(int* argc, wchar_t** argv) { +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +} // namespace testing diff --git a/libs/libvpx/third_party/libwebm/README.libvpx b/libs/libvpx/third_party/libwebm/README.libvpx index 73f8303222..ebb5ff2f4d 100644 --- a/libs/libvpx/third_party/libwebm/README.libvpx +++ b/libs/libvpx/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 32d5ac49414a8914ec1e1f285f3f927c6e8ec29d +Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 License: BSD License File: LICENSE.txt diff --git a/libs/libvpx/third_party/libwebm/common/file_util.cc b/libs/libvpx/third_party/libwebm/common/file_util.cc index 4f91318f3e..6dab146dd9 100644 --- a/libs/libvpx/third_party/libwebm/common/file_util.cc +++ b/libs/libvpx/third_party/libwebm/common/file_util.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -21,13 +22,23 @@ namespace libwebm { std::string GetTempFileName() { #if !defined _MSC_VER && !defined __MINGW32__ - char temp_file_name_template[] = "libwebm_temp.XXXXXX"; + std::string temp_file_name_template_str = + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") : + ".") + + "/libwebm_temp.XXXXXX"; + char* temp_file_name_template = + new char[temp_file_name_template_str.length() + 1]; + memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1); + temp_file_name_template_str.copy(temp_file_name_template, + temp_file_name_template_str.length(), 0); int fd = mkstemp(temp_file_name_template); + std::string temp_file_name = + (fd != -1) ? std::string(temp_file_name_template) : std::string(); + delete[] temp_file_name_template; if (fd != -1) { close(fd); - return std::string(temp_file_name_template); } - return std::string(); + return temp_file_name; #else char tmp_file_name[_MAX_PATH]; errno_t err = tmpnam_s(tmp_file_name); diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.cc b/libs/libvpx/third_party/libwebm/common/hdr_util.cc index e1a9842fb6..e1618ce75a 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.cc +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.cc @@ -7,12 +7,15 @@ // be found in the AUTHORS file in the root of the source tree. 
#include "hdr_util.h" +#include #include #include #include "mkvparser/mkvparser.h" namespace libwebm { +const int Vp9CodecFeatures::kValueNotPresent = INT_MAX; + bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc) { muxer_pc->reset(new (std::nothrow) @@ -29,9 +32,9 @@ bool MasteringMetadataValuePresent(double value) { bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, mkvmuxer::MasteringMetadata* muxer_mm) { if (MasteringMetadataValuePresent(parser_mm.luminance_max)) - muxer_mm->luminance_max = parser_mm.luminance_max; + muxer_mm->set_luminance_max(parser_mm.luminance_max); if (MasteringMetadataValuePresent(parser_mm.luminance_min)) - muxer_mm->luminance_min = parser_mm.luminance_min; + muxer_mm->set_luminance_min(parser_mm.luminance_min); PrimaryChromaticityPtr r_ptr(NULL); PrimaryChromaticityPtr g_ptr(NULL); @@ -73,34 +76,37 @@ bool CopyColour(const mkvparser::Colour& parser_colour, return false; if (ColourValuePresent(parser_colour.matrix_coefficients)) - muxer_colour->matrix_coefficients = parser_colour.matrix_coefficients; + muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients); if (ColourValuePresent(parser_colour.bits_per_channel)) - muxer_colour->bits_per_channel = parser_colour.bits_per_channel; - if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) - muxer_colour->chroma_subsampling_horz = - parser_colour.chroma_subsampling_horz; - if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) - muxer_colour->chroma_subsampling_vert = - parser_colour.chroma_subsampling_vert; + muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel); + if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) { + muxer_colour->set_chroma_subsampling_horz( + parser_colour.chroma_subsampling_horz); + } + if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) { + muxer_colour->set_chroma_subsampling_vert( + parser_colour.chroma_subsampling_vert); + } if (ColourValuePresent(parser_colour.cb_subsampling_horz)) - muxer_colour->cb_subsampling_horz = parser_colour.cb_subsampling_horz; + muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz); if (ColourValuePresent(parser_colour.cb_subsampling_vert)) - muxer_colour->cb_subsampling_vert = parser_colour.cb_subsampling_vert; + muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert); if (ColourValuePresent(parser_colour.chroma_siting_horz)) - muxer_colour->chroma_siting_horz = parser_colour.chroma_siting_horz; + muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz); if (ColourValuePresent(parser_colour.chroma_siting_vert)) - muxer_colour->chroma_siting_vert = parser_colour.chroma_siting_vert; + muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert); if (ColourValuePresent(parser_colour.range)) - muxer_colour->range = parser_colour.range; - if (ColourValuePresent(parser_colour.transfer_characteristics)) - muxer_colour->transfer_characteristics = - parser_colour.transfer_characteristics; + muxer_colour->set_range(parser_colour.range); + if (ColourValuePresent(parser_colour.transfer_characteristics)) { + muxer_colour->set_transfer_characteristics( + parser_colour.transfer_characteristics); + } if (ColourValuePresent(parser_colour.primaries)) - muxer_colour->primaries = parser_colour.primaries; + muxer_colour->set_primaries(parser_colour.primaries); if (ColourValuePresent(parser_colour.max_cll)) - muxer_colour->max_cll = parser_colour.max_cll; + 
muxer_colour->set_max_cll(parser_colour.max_cll); if (ColourValuePresent(parser_colour.max_fall)) - muxer_colour->max_fall = parser_colour.max_fall; + muxer_colour->set_max_fall(parser_colour.max_fall); if (parser_colour.mastering_metadata) { mkvmuxer::MasteringMetadata muxer_mm; @@ -116,8 +122,8 @@ bool CopyColour(const mkvparser::Colour& parser_colour, // // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | ID Byte | Length | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +// | ID Byte | Length | | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | // | | // : Bytes 1..Length of Codec Feature : // | | @@ -132,51 +138,83 @@ bool CopyColour(const mkvparser::Colour& parser_colour, // // The X bit is reserved. // -// Currently only profile level is supported. ID byte must be set to 1, and -// length must be 1. Supported values are: -// -// 10: Level 1 -// 11: Level 1.1 -// 20: Level 2 -// 21: Level 2.1 -// 30: Level 3 -// 31: Level 3.1 -// 40: Level 4 -// 41: Level 4.1 -// 50: Level 5 -// 51: Level 5.1 -// 52: Level 5.2 -// 60: Level 6 -// 61: Level 6.1 -// 62: Level 6.2 -// // See the following link for more information: // http://www.webmproject.org/vp9/profiles/ -int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length) { - const int kVpxCodecPrivateLength = 3; - if (!private_data || length != kVpxCodecPrivateLength) - return 0; +bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, + Vp9CodecFeatures* features) { + const int kVpxCodecPrivateMinLength = 3; + if (!private_data || !features || length < kVpxCodecPrivateMinLength) + return false; - const uint8_t id_byte = *private_data; - if (id_byte != 1) - return 0; + const uint8_t kVp9ProfileId = 1; + const uint8_t kVp9LevelId = 2; + const uint8_t kVp9BitDepthId = 3; + const uint8_t kVp9ChromaSubsamplingId = 4; + const int kVpxFeatureLength = 1; + int offset = 0; - const int kVpxProfileLength = 1; - const uint8_t length_byte = private_data[1]; - if (length_byte != kVpxProfileLength) - return 0; + // Set features to not set. 
+ features->profile = Vp9CodecFeatures::kValueNotPresent; + features->level = Vp9CodecFeatures::kValueNotPresent; + features->bit_depth = Vp9CodecFeatures::kValueNotPresent; + features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent; + do { + const uint8_t id_byte = private_data[offset++]; + const uint8_t length_byte = private_data[offset++]; + if (length_byte != kVpxFeatureLength) + return false; + if (id_byte == kVp9ProfileId) { + const int priv_profile = static_cast(private_data[offset++]); + if (priv_profile < 0 || priv_profile > 3) + return false; + if (features->profile != Vp9CodecFeatures::kValueNotPresent && + features->profile != priv_profile) { + return false; + } + features->profile = priv_profile; + } else if (id_byte == kVp9LevelId) { + const int priv_level = static_cast(private_data[offset++]); - const int level = static_cast(private_data[2]); + const int kNumLevels = 14; + const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40, + 41, 50, 51, 52, 60, 61, 62}; - const int kNumLevels = 14; - const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40, - 41, 50, 51, 52, 60, 61, 62}; + for (int i = 0; i < kNumLevels; ++i) { + if (priv_level == levels[i]) { + if (features->level != Vp9CodecFeatures::kValueNotPresent && + features->level != priv_level) { + return false; + } + features->level = priv_level; + break; + } + } + if (features->level == Vp9CodecFeatures::kValueNotPresent) + return false; + } else if (id_byte == kVp9BitDepthId) { + const int priv_profile = static_cast(private_data[offset++]); + if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12) + return false; + if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent && + features->bit_depth != priv_profile) { + return false; + } + features->bit_depth = priv_profile; + } else if (id_byte == kVp9ChromaSubsamplingId) { + const int priv_profile = static_cast(private_data[offset++]); + if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3) + return false; + if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent && + features->chroma_subsampling != priv_profile) { + return false; + } + features->chroma_subsampling = priv_profile; + } else { + // Invalid ID. + return false; + } + } while (offset + kVpxCodecPrivateMinLength <= length); - for (int i = 0; i < kNumLevels; ++i) { - if (level == levels[i]) - return level; - } - - return 0; + return true; } } // namespace libwebm diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.h b/libs/libvpx/third_party/libwebm/common/hdr_util.h index d30c2b9f2a..3ef5388fd0 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.h +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.h @@ -28,7 +28,34 @@ namespace libwebm { // TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is // required by libwebm. +// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video +// stream. A value of kValueNotPresent represents that the value was not set in +// the CodecPrivate. 
+struct Vp9CodecFeatures { + static const int kValueNotPresent; + + Vp9CodecFeatures() + : profile(kValueNotPresent), + level(kValueNotPresent), + bit_depth(kValueNotPresent), + chroma_subsampling(kValueNotPresent) {} + ~Vp9CodecFeatures() {} + + int profile; + int level; + int bit_depth; + int chroma_subsampling; +}; + +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif typedef std::auto_ptr PrimaryChromaticityPtr; +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic pop +#endif bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); @@ -43,8 +70,9 @@ bool ColourValuePresent(long long value); bool CopyColour(const mkvparser::Colour& parser_colour, mkvmuxer::Colour* muxer_colour); -// Returns VP9 profile upon success or 0 upon failure. -int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length); +// Returns true if |features| is set to one or more valid values. +bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, + Vp9CodecFeatures* features); } // namespace libwebm diff --git a/libs/libvpx/third_party/libwebm/common/webmids.h b/libs/libvpx/third_party/libwebm/common/webmids.h index 32a0c5fb91..89d722a71b 100644 --- a/libs/libvpx/third_party/libwebm/common/webmids.h +++ b/libs/libvpx/third_party/libwebm/common/webmids.h @@ -124,6 +124,14 @@ enum MkvId { kMkvLuminanceMin = 0x55DA, // end mastering metadata // end colour + // projection + kMkvProjection = 0x7670, + kMkvProjectionType = 0x7671, + kMkvProjectionPrivate = 0x7672, + kMkvProjectionPoseYaw = 0x7673, + kMkvProjectionPosePitch = 0x7674, + kMkvProjectionPoseRoll = 0x7675, + // end projection // audio kMkvAudio = 0xE1, kMkvSamplingFrequency = 0xB5, diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index c79ce24ed3..15b9a908d8 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "common/webmids.h" @@ -23,12 +24,26 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvmuxer { +const float PrimaryChromaticity::kChromaticityMin = 0.0f; +const float PrimaryChromaticity::kChromaticityMax = 1.0f; +const float MasteringMetadata::kMinLuminance = 0.0f; +const float MasteringMetadata::kMinLuminanceMax = 999.99f; +const float MasteringMetadata::kMaxLuminanceMax = 9999.99f; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const uint64_t Colour::kValueNotPresent = UINT64_MAX; namespace { + +const char kDocTypeWebm[] = "webm"; +const char kDocTypeMatroska[] = "matroska"; + // Deallocate the string designated by |dst|, and then copy the |src| // string to |dst|. 
The caller owns both the |src| string and the // |dst| copy (hence the caller is responsible for eventually @@ -63,7 +78,7 @@ bool CopyChromaticity(const PrimaryChromaticity* src, if (!dst) return false; - dst->reset(new (std::nothrow) PrimaryChromaticity(src->x, src->y)); + dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y())); if (!dst->get()) return false; @@ -80,36 +95,57 @@ IMkvWriter::IMkvWriter() {} IMkvWriter::~IMkvWriter() {} -bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) { +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type) { // Level 0 - uint64_t size = EbmlElementSize(libwebm::kMkvEBMLVersion, UINT64_C(1)); - size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, UINT64_C(1)); - size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, UINT64_C(4)); - size += EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8)); - size += EbmlElementSize(libwebm::kMkvDocType, "webm"); - size += EbmlElementSize(libwebm::kMkvDocTypeVersion, doc_type_version); - size += EbmlElementSize(libwebm::kMkvDocTypeReadVersion, UINT64_C(2)); + uint64_t size = + EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast(1)); + size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast(1)); + size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast(4)); + size += + EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast(8)); + size += EbmlElementSize(libwebm::kMkvDocType, doc_type); + size += EbmlElementSize(libwebm::kMkvDocTypeVersion, + static_cast(doc_type_version)); + size += + EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast(2)); if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, UINT64_C(1))) + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, + static_cast(1))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, UINT64_C(1))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, + static_cast(1))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, UINT64_C(4))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, + static_cast(4))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, + static_cast(8))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocType, "webm")) + } + if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, doc_type_version)) + if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, + static_cast(doc_type_version))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, UINT64_C(2))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, + static_cast(2))) { return false; + } return true; } +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) { + return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm); +} + bool WriteEbmlHeader(IMkvWriter* writer) { return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion); } @@ -262,15 +298,17 @@ bool CuePoint::Write(IMkvWriter* writer) const { if (!writer || track_ < 1 || cluster_pos_ < 1) return false; - uint64_t size = - EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_); - size += EbmlElementSize(libwebm::kMkvCueTrack, track_); + uint64_t size = 
EbmlElementSize(libwebm::kMkvCueClusterPosition, + static_cast(cluster_pos_)); + size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast(track_)); if (output_block_number_ && block_number_ > 1) - size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_); + size += EbmlElementSize(libwebm::kMkvCueBlockNumber, + static_cast(block_number_)); const uint64_t track_pos_size = EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size; const uint64_t payload_size = - EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size; + EbmlElementSize(libwebm::kMkvCueTime, static_cast(time_)) + + track_pos_size; if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size)) return false; @@ -279,18 +317,27 @@ bool CuePoint::Write(IMkvWriter* writer) const { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, time_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, + static_cast(time_))) { return false; + } if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, track_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, + static_cast(track_))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, cluster_pos_)) + } + if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, + static_cast(cluster_pos_))) { return false; - if (output_block_number_ && block_number_ > 1) - if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, block_number_)) + } + if (output_block_number_ && block_number_ > 1) { + if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, + static_cast(block_number_))) { return false; + } + } const int64_t stop_position = writer->Position(); if (stop_position < 0) @@ -303,15 +350,17 @@ bool CuePoint::Write(IMkvWriter* writer) const { } uint64_t CuePoint::PayloadSize() const { - uint64_t size = - EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_); - size += EbmlElementSize(libwebm::kMkvCueTrack, track_); + uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition, + static_cast(cluster_pos_)); + size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast(track_)); if (output_block_number_ && block_number_ > 1) - size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_); + size += EbmlElementSize(libwebm::kMkvCueBlockNumber, + static_cast(block_number_)); const uint64_t track_pos_size = EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size; const uint64_t payload_size = - EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size; + EbmlElementSize(libwebm::kMkvCueTime, static_cast(time_)) + + track_pos_size; return payload_size; } @@ -456,8 +505,9 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const { return false; if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode, - cipher_mode_)) + static_cast(cipher_mode_))) { return false; + } const int64_t stop_position = writer->Position(); if (stop_position < 0 || @@ -468,8 +518,8 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const { } uint64_t ContentEncAESSettings::PayloadSize() const { - uint64_t size = - EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, cipher_mode_); + uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, + static_cast(cipher_mode_)); return size; } @@ -529,20 +579,22 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { encoding_size)) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder, - 
encoding_order_)) + static_cast(encoding_order_))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope, - encoding_scope_)) + static_cast(encoding_scope_))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType, - encoding_type_)) + static_cast(encoding_type_))) return false; if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption, encryption_size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, enc_algo_)) + if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, + static_cast(enc_algo_))) { return false; + } if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_)) return false; @@ -571,12 +623,12 @@ uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) + encryption_size; } - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingType, encoding_type_); - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingScope, encoding_scope_); - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingOrder, encoding_order_); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType, + static_cast(encoding_type_)); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope, + static_cast(encoding_scope_)); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder, + static_cast(encoding_order_)); return encoding_size; } @@ -586,7 +638,8 @@ uint64_t ContentEncoding::EncryptionSize() const { uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_); - encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, enc_algo_); + encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, + static_cast(enc_algo_)); return encryption_size + aes_size; } @@ -664,9 +717,10 @@ ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const { } uint64_t Track::PayloadSize() const { - uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_); - size += EbmlElementSize(libwebm::kMkvTrackUID, uid_); - size += EbmlElementSize(libwebm::kMkvTrackType, type_); + uint64_t size = + EbmlElementSize(libwebm::kMkvTrackNumber, static_cast(number_)); + size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast(uid_)); + size += EbmlElementSize(libwebm::kMkvTrackType, static_cast(type_)); if (codec_id_) size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_); if (codec_private_) @@ -676,15 +730,22 @@ uint64_t Track::PayloadSize() const { size += EbmlElementSize(libwebm::kMkvLanguage, language_); if (name_) size += EbmlElementSize(libwebm::kMkvName, name_); - if (max_block_additional_id_) + if (max_block_additional_id_) { size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_); - if (codec_delay_) - size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_); - if (seek_pre_roll_) - size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_); - if (default_duration_) - size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_); + static_cast(max_block_additional_id_)); + } + if (codec_delay_) { + size += EbmlElementSize(libwebm::kMkvCodecDelay, + static_cast(codec_delay_)); + } + if (seek_pre_roll_) { + size += EbmlElementSize(libwebm::kMkvSeekPreRoll, + static_cast(seek_pre_roll_)); + } + if (default_duration_) { + size += EbmlElementSize(libwebm::kMkvDefaultDuration, + static_cast(default_duration_)); + } if (content_encoding_entries_size_ > 0) 
{ uint64_t content_encodings_size = 0; @@ -722,55 +783,64 @@ bool Track::Write(IMkvWriter* writer) const { if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size)) return false; - uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_); - size += EbmlElementSize(libwebm::kMkvTrackUID, uid_); - size += EbmlElementSize(libwebm::kMkvTrackType, type_); + uint64_t size = + EbmlElementSize(libwebm::kMkvTrackNumber, static_cast(number_)); + size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast(uid_)); + size += EbmlElementSize(libwebm::kMkvTrackType, static_cast(type_)); if (codec_id_) size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_); if (codec_private_) size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_, - codec_private_length_); + static_cast(codec_private_length_)); if (language_) size += EbmlElementSize(libwebm::kMkvLanguage, language_); if (name_) size += EbmlElementSize(libwebm::kMkvName, name_); if (max_block_additional_id_) size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_); + static_cast(max_block_additional_id_)); if (codec_delay_) - size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_); + size += EbmlElementSize(libwebm::kMkvCodecDelay, + static_cast(codec_delay_)); if (seek_pre_roll_) - size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_); + size += EbmlElementSize(libwebm::kMkvSeekPreRoll, + static_cast(seek_pre_roll_)); if (default_duration_) - size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_); + size += EbmlElementSize(libwebm::kMkvDefaultDuration, + static_cast(default_duration_)); const int64_t payload_position = writer->Position(); if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, number_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, + static_cast(number_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, uid_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, + static_cast(uid_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, type_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, + static_cast(type_))) return false; if (max_block_additional_id_) { if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_)) { + static_cast(max_block_additional_id_))) { return false; } } if (codec_delay_) { - if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, codec_delay_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, + static_cast(codec_delay_))) return false; } if (seek_pre_roll_) { - if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, seek_pre_roll_)) + if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, + static_cast(seek_pre_roll_))) return false; } if (default_duration_) { if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration, - default_duration_)) + static_cast(default_duration_))) return false; } if (codec_id_) { @@ -779,7 +849,7 @@ bool Track::Write(IMkvWriter* writer) const { } if (codec_private_) { if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_, - codec_private_length_)) + static_cast(codec_private_length_))) return false; } if (language_) { @@ -890,14 +960,23 @@ void Track::set_name(const char* name) { // // Colour and its child elements -uint64_t PrimaryChromaticity::PrimaryChromaticityPayloadSize( +uint64_t PrimaryChromaticity::PrimaryChromaticitySize( libwebm::MkvId x_id, libwebm::MkvId y_id) const { - return EbmlElementSize(x_id, x) + 
EbmlElementSize(y_id, y); + return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_); } bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id, libwebm::MkvId y_id) const { - return WriteEbmlElement(writer, x_id, x) && WriteEbmlElement(writer, y_id, y); + if (!Valid()) { + return false; + } + return WriteEbmlElement(writer, x_id, x_) && + WriteEbmlElement(writer, y_id, y_); +} + +bool PrimaryChromaticity::Valid() const { + return (x_ >= kChromaticityMin && x_ <= kChromaticityMax && + y_ >= kChromaticityMin && y_ <= kChromaticityMax); } uint64_t MasteringMetadata::MasteringMetadataSize() const { @@ -909,6 +988,31 @@ uint64_t MasteringMetadata::MasteringMetadataSize() const { return size; } +bool MasteringMetadata::Valid() const { + if (luminance_min_ != kValueNotPresent) { + if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax || + luminance_min_ > luminance_max_) { + return false; + } + } + if (luminance_max_ != kValueNotPresent) { + if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax || + luminance_max_ < luminance_min_) { + return false; + } + } + if (r_ && !r_->Valid()) + return false; + if (g_ && !g_->Valid()) + return false; + if (b_ && !b_->Valid()) + return false; + if (white_point_ && !white_point_->Valid()) + return false; + + return true; +} + bool MasteringMetadata::Write(IMkvWriter* writer) const { const uint64_t size = PayloadSize(); @@ -918,12 +1022,12 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size)) return false; - if (luminance_max != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max)) { + if (luminance_max_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) { return false; } - if (luminance_min != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min)) { + if (luminance_min_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } if (r_ && @@ -984,25 +1088,25 @@ bool MasteringMetadata::SetChromaticity( uint64_t MasteringMetadata::PayloadSize() const { uint64_t size = 0; - if (luminance_max != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max); - if (luminance_min != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min); + if (luminance_max_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_); + if (luminance_min_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_); if (r_) { - size += r_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryRChromaticityX, libwebm::kMkvPrimaryRChromaticityY); + size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY); } if (g_) { - size += g_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryGChromaticityX, libwebm::kMkvPrimaryGChromaticityY); + size += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY); } if (b_) { - size += b_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryBChromaticityX, libwebm::kMkvPrimaryBChromaticityY); + size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY); } if (white_point_) { - size += white_point_->PrimaryChromaticityPayloadSize( + size += white_point_->PrimaryChromaticitySize( 
libwebm::kMkvWhitePointChromaticityX, libwebm::kMkvWhitePointChromaticityY); } @@ -1019,6 +1123,33 @@ uint64_t Colour::ColourSize() const { return size; } +bool Colour::Valid() const { + if (mastering_metadata_ && !mastering_metadata_->Valid()) + return false; + if (matrix_coefficients_ != kValueNotPresent && + !IsMatrixCoefficientsValueValid(matrix_coefficients_)) { + return false; + } + if (chroma_siting_horz_ != kValueNotPresent && + !IsChromaSitingHorzValueValid(chroma_siting_horz_)) { + return false; + } + if (chroma_siting_vert_ != kValueNotPresent && + !IsChromaSitingVertValueValid(chroma_siting_vert_)) { + return false; + } + if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_)) + return false; + if (transfer_characteristics_ != kValueNotPresent && + !IsTransferCharacteristicsValueValid(transfer_characteristics_)) { + return false; + } + if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_)) + return false; + + return true; +} + bool Colour::Write(IMkvWriter* writer) const { const uint64_t size = PayloadSize(); @@ -1026,69 +1157,77 @@ bool Colour::Write(IMkvWriter* writer) const { if (size == 0) return true; + // Don't write an invalid element. + if (!Valid()) + return false; + if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size)) return false; - if (matrix_coefficients != kValueNotPresent && + if (matrix_coefficients_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients, - matrix_coefficients)) { + static_cast(matrix_coefficients_))) { return false; } - if (bits_per_channel != kValueNotPresent && + if (bits_per_channel_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel, - bits_per_channel)) { + static_cast(bits_per_channel_))) { return false; } - if (chroma_subsampling_horz != kValueNotPresent && + if (chroma_subsampling_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz, - chroma_subsampling_horz)) { + static_cast(chroma_subsampling_horz_))) { return false; } - if (chroma_subsampling_vert != kValueNotPresent && + if (chroma_subsampling_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert, - chroma_subsampling_vert)) { + static_cast(chroma_subsampling_vert_))) { return false; } - if (cb_subsampling_horz != kValueNotPresent && + if (cb_subsampling_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz, - cb_subsampling_horz)) { + static_cast(cb_subsampling_horz_))) { return false; } - if (cb_subsampling_vert != kValueNotPresent && + if (cb_subsampling_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert, - cb_subsampling_vert)) { + static_cast(cb_subsampling_vert_))) { return false; } - if (chroma_siting_horz != kValueNotPresent && + if (chroma_siting_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz, - chroma_siting_horz)) { + static_cast(chroma_siting_horz_))) { return false; } - if (chroma_siting_vert != kValueNotPresent && + if (chroma_siting_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert, - chroma_siting_vert)) { + static_cast(chroma_siting_vert_))) { return false; } - if (range != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvRange, range)) { + if (range_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvRange, + static_cast(range_))) { return false; } - if (transfer_characteristics != kValueNotPresent && + if (transfer_characteristics_ != 
kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics, - transfer_characteristics)) { + static_cast(transfer_characteristics_))) { return false; } - if (primaries != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvPrimaries, primaries)) { + if (primaries_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvPrimaries, + static_cast(primaries_))) { return false; } - if (max_cll != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, max_cll)) { + if (max_cll_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, + static_cast(max_cll_))) { return false; } - if (max_fall != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, max_fall)) { + if (max_fall_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, + static_cast(max_fall_))) { return false; } @@ -1103,8 +1242,8 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { if (!mm_ptr.get()) return false; - mm_ptr->luminance_max = mastering_metadata.luminance_max; - mm_ptr->luminance_min = mastering_metadata.luminance_min; + mm_ptr->set_luminance_max(mastering_metadata.luminance_max()); + mm_ptr->set_luminance_min(mastering_metadata.luminance_min()); if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(), mastering_metadata.b(), @@ -1120,38 +1259,56 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { uint64_t Colour::PayloadSize() const { uint64_t size = 0; - if (matrix_coefficients != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvMatrixCoefficients, matrix_coefficients); - if (bits_per_channel != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvBitsPerChannel, bits_per_channel); - if (chroma_subsampling_horz != kValueNotPresent) + if (matrix_coefficients_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvMatrixCoefficients, + static_cast(matrix_coefficients_)); + } + if (bits_per_channel_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvBitsPerChannel, + static_cast(bits_per_channel_)); + } + if (chroma_subsampling_horz_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz, - chroma_subsampling_horz); - if (chroma_subsampling_vert != kValueNotPresent) + static_cast(chroma_subsampling_horz_)); + } + if (chroma_subsampling_vert_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert, - chroma_subsampling_vert); - if (cb_subsampling_horz != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz); - if (cb_subsampling_vert != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert); - if (chroma_siting_horz != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvChromaSitingHorz, chroma_siting_horz); - if (chroma_siting_vert != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvChromaSitingVert, chroma_siting_vert); - if (range != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvRange, range); - if (transfer_characteristics != kValueNotPresent) + static_cast(chroma_subsampling_vert_)); + } + if (cb_subsampling_horz_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, + static_cast(cb_subsampling_horz_)); + } + if (cb_subsampling_vert_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert, + static_cast(cb_subsampling_vert_)); + } + if (chroma_siting_horz_ != kValueNotPresent) { + size += 
EbmlElementSize(libwebm::kMkvChromaSitingHorz, + static_cast(chroma_siting_horz_)); + } + if (chroma_siting_vert_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvChromaSitingVert, + static_cast(chroma_siting_vert_)); + } + if (range_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvRange, static_cast(range_)); + } + if (transfer_characteristics_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvTransferCharacteristics, - transfer_characteristics); - if (primaries != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvPrimaries, primaries); - if (max_cll != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvMaxCLL, max_cll); - if (max_fall != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvMaxFALL, max_fall); + static_cast(transfer_characteristics_)); + } + if (primaries_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvPrimaries, + static_cast(primaries_)); + } + if (max_cll_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast(max_cll_)); + } + if (max_fall_ != kValueNotPresent) { + size += + EbmlElementSize(libwebm::kMkvMaxFALL, static_cast(max_fall_)); + } if (mastering_metadata_) size += mastering_metadata_->MasteringMetadataSize(); @@ -1159,6 +1316,95 @@ uint64_t Colour::PayloadSize() const { return size; } +/////////////////////////////////////////////////////////////// +// +// Projection element + +uint64_t Projection::ProjectionSize() const { + uint64_t size = PayloadSize(); + + if (size > 0) + size += EbmlMasterElementSize(libwebm::kMkvProjection, size); + + return size; +} + +bool Projection::Write(IMkvWriter* writer) const { + const uint64_t size = PayloadSize(); + + // Don't write an empty element. + if (size == 0) + return true; + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size)) + return false; + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType, + static_cast(type_))) { + return false; + } + + if (private_data_length_ > 0 && private_data_ != NULL && + !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_, + private_data_length_)) { + return false; + } + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_)) + return false; + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch, + pose_pitch_)) { + return false; + } + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) { + return false; + } + + return true; +} + +bool Projection::SetProjectionPrivate(const uint8_t* data, + uint64_t data_length) { + if (data == NULL || data_length == 0) { + return false; + } + + if (data_length != static_cast(data_length)) { + return false; + } + + uint8_t* new_private_data = + new (std::nothrow) uint8_t[static_cast(data_length)]; + if (new_private_data == NULL) { + return false; + } + + delete[] private_data_; + private_data_ = new_private_data; + private_data_length_ = data_length; + memcpy(private_data_, data, static_cast(data_length)); + + return true; +} + +uint64_t Projection::PayloadSize() const { + uint64_t size = + EbmlElementSize(libwebm::kMkvProjection, static_cast(type_)); + + if (private_data_length_ > 0 && private_data_ != NULL) { + size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_, + private_data_length_); + } + + size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_); + size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_); + size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_); + + return size; +} + 
/////////////////////////////////////////////////////////////// // // VideoTrack Class @@ -1167,6 +1413,8 @@ VideoTrack::VideoTrack(unsigned int* seed) : Track(seed), display_height_(0), display_width_(0), + pixel_height_(0), + pixel_width_(0), crop_left_(0), crop_right_(0), crop_top_(0), @@ -1176,9 +1424,13 @@ VideoTrack::VideoTrack(unsigned int* seed) stereo_mode_(0), alpha_mode_(0), width_(0), - colour_(NULL) {} + colour_(NULL), + projection_(NULL) {} -VideoTrack::~VideoTrack() { delete colour_; } +VideoTrack::~VideoTrack() { + delete colour_; + delete projection_; +} bool VideoTrack::SetStereoMode(uint64_t stereo_mode) { if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst && @@ -1221,40 +1473,52 @@ bool VideoTrack::Write(IMkvWriter* writer) const { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvPixelWidth, width_)) + if (!WriteEbmlElement( + writer, libwebm::kMkvPixelWidth, + static_cast((pixel_width_ > 0) ? pixel_width_ : width_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvPixelHeight, height_)) + if (!WriteEbmlElement( + writer, libwebm::kMkvPixelHeight, + static_cast((pixel_height_ > 0) ? pixel_height_ : height_))) return false; if (display_width_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, display_width_)) + if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, + static_cast(display_width_))) return false; } if (display_height_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, display_height_)) + if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, + static_cast(display_height_))) return false; } if (crop_left_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, crop_left_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, + static_cast(crop_left_))) return false; } if (crop_right_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, crop_right_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, + static_cast(crop_right_))) return false; } if (crop_top_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, crop_top_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, + static_cast(crop_top_))) return false; } if (crop_bottom_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, crop_bottom_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, + static_cast(crop_bottom_))) return false; } if (stereo_mode_ > kMono) { - if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, stereo_mode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, + static_cast(stereo_mode_))) return false; } if (alpha_mode_ > kNoAlpha) { - if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, alpha_mode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, + static_cast(alpha_mode_))) return false; } if (frame_rate_ > 0.0) { @@ -1267,6 +1531,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const { if (!colour_->Write(writer)) return false; } + if (projection_) { + if (!projection_->Write(writer)) + return false; + } const int64_t stop_position = writer->Position(); if (stop_position < 0 || @@ -1287,47 +1555,83 @@ bool VideoTrack::SetColour(const Colour& colour) { return false; } - colour_ptr->matrix_coefficients = colour.matrix_coefficients; - colour_ptr->bits_per_channel = colour.bits_per_channel; - colour_ptr->chroma_subsampling_horz = colour.chroma_subsampling_horz; - colour_ptr->chroma_subsampling_vert = colour.chroma_subsampling_vert; - colour_ptr->cb_subsampling_horz = 
colour.cb_subsampling_horz; - colour_ptr->cb_subsampling_vert = colour.cb_subsampling_vert; - colour_ptr->chroma_siting_horz = colour.chroma_siting_horz; - colour_ptr->chroma_siting_vert = colour.chroma_siting_vert; - colour_ptr->range = colour.range; - colour_ptr->transfer_characteristics = colour.transfer_characteristics; - colour_ptr->primaries = colour.primaries; - colour_ptr->max_cll = colour.max_cll; - colour_ptr->max_fall = colour.max_fall; + colour_ptr->set_matrix_coefficients(colour.matrix_coefficients()); + colour_ptr->set_bits_per_channel(colour.bits_per_channel()); + colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz()); + colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert()); + colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz()); + colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert()); + colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz()); + colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert()); + colour_ptr->set_range(colour.range()); + colour_ptr->set_transfer_characteristics(colour.transfer_characteristics()); + colour_ptr->set_primaries(colour.primaries()); + colour_ptr->set_max_cll(colour.max_cll()); + colour_ptr->set_max_fall(colour.max_fall()); + delete colour_; colour_ = colour_ptr.release(); return true; } +bool VideoTrack::SetProjection(const Projection& projection) { + std::auto_ptr projection_ptr(new Projection()); + if (!projection_ptr.get()) + return false; + + if (projection.private_data()) { + if (!projection_ptr->SetProjectionPrivate( + projection.private_data(), projection.private_data_length())) { + return false; + } + } + + projection_ptr->set_type(projection.type()); + projection_ptr->set_pose_yaw(projection.pose_yaw()); + projection_ptr->set_pose_pitch(projection.pose_pitch()); + projection_ptr->set_pose_roll(projection.pose_roll()); + delete projection_; + projection_ = projection_ptr.release(); + return true; +} + uint64_t VideoTrack::VideoPayloadSize() const { - uint64_t size = EbmlElementSize(libwebm::kMkvPixelWidth, width_); - size += EbmlElementSize(libwebm::kMkvPixelHeight, height_); + uint64_t size = EbmlElementSize( + libwebm::kMkvPixelWidth, + static_cast((pixel_width_ > 0) ? pixel_width_ : width_)); + size += EbmlElementSize( + libwebm::kMkvPixelHeight, + static_cast((pixel_height_ > 0) ? 
pixel_height_ : height_)); if (display_width_ > 0) - size += EbmlElementSize(libwebm::kMkvDisplayWidth, display_width_); + size += EbmlElementSize(libwebm::kMkvDisplayWidth, + static_cast(display_width_)); if (display_height_ > 0) - size += EbmlElementSize(libwebm::kMkvDisplayHeight, display_height_); + size += EbmlElementSize(libwebm::kMkvDisplayHeight, + static_cast(display_height_)); if (crop_left_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropLeft, crop_left_); + size += EbmlElementSize(libwebm::kMkvPixelCropLeft, + static_cast(crop_left_)); if (crop_right_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropRight, crop_right_); + size += EbmlElementSize(libwebm::kMkvPixelCropRight, + static_cast(crop_right_)); if (crop_top_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropTop, crop_top_); + size += EbmlElementSize(libwebm::kMkvPixelCropTop, + static_cast(crop_top_)); if (crop_bottom_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropBottom, crop_bottom_); + size += EbmlElementSize(libwebm::kMkvPixelCropBottom, + static_cast(crop_bottom_)); if (stereo_mode_ > kMono) - size += EbmlElementSize(libwebm::kMkvStereoMode, stereo_mode_); + size += EbmlElementSize(libwebm::kMkvStereoMode, + static_cast(stereo_mode_)); if (alpha_mode_ > kNoAlpha) - size += EbmlElementSize(libwebm::kMkvAlphaMode, alpha_mode_); + size += EbmlElementSize(libwebm::kMkvAlphaMode, + static_cast(alpha_mode_)); if (frame_rate_ > 0.0) size += EbmlElementSize(libwebm::kMkvFrameRate, static_cast(frame_rate_)); if (colour_) size += colour_->ColourSize(); + if (projection_) + size += projection_->ProjectionSize(); return size; } @@ -1346,9 +1650,11 @@ uint64_t AudioTrack::PayloadSize() const { uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency, static_cast(sample_rate_)); - size += EbmlElementSize(libwebm::kMkvChannels, channels_); + size += + EbmlElementSize(libwebm::kMkvChannels, static_cast(channels_)); if (bit_depth_ > 0) - size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_); + size += + EbmlElementSize(libwebm::kMkvBitDepth, static_cast(bit_depth_)); size += EbmlMasterElementSize(libwebm::kMkvAudio, size); return parent_size + size; @@ -1361,9 +1667,11 @@ bool AudioTrack::Write(IMkvWriter* writer) const { // Calculate AudioSettings size. 
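Both the writer and the size calculation now prefer the explicit pixel dimensions and fall back to the width/height given at track creation. A hedged usage sketch of the new setters, with illustrative anamorphic numbers (720x576 coded frames intended for a 16:9 display):

```cpp
#include "mkvmuxer/mkvmuxer.h"

// Hypothetical example: the new set_pixel_width()/set_pixel_height() override
// the width/height passed to Segment::AddVideoTrack(), while the display_*
// fields keep their existing role of describing the intended display size.
void ConfigureAnamorphicVideo(mkvmuxer::VideoTrack* track) {
  track->set_pixel_width(720);
  track->set_pixel_height(576);
  track->set_display_width(1024);
  track->set_display_height(576);
}
```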
uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency, static_cast(sample_rate_)); - size += EbmlElementSize(libwebm::kMkvChannels, channels_); + size += + EbmlElementSize(libwebm::kMkvChannels, static_cast(channels_)); if (bit_depth_ > 0) - size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_); + size += + EbmlElementSize(libwebm::kMkvBitDepth, static_cast(bit_depth_)); if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size)) return false; @@ -1375,10 +1683,12 @@ bool AudioTrack::Write(IMkvWriter* writer) const { if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency, static_cast(sample_rate_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvChannels, channels_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChannels, + static_cast(channels_))) return false; if (bit_depth_ > 0) - if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, bit_depth_)) + if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, + static_cast(bit_depth_))) return false; const int64_t stop_position = writer->Position(); @@ -1398,6 +1708,10 @@ const char Tracks::kVorbisCodecId[] = "A_VORBIS"; const char Tracks::kVp8CodecId[] = "V_VP8"; const char Tracks::kVp9CodecId[] = "V_VP9"; const char Tracks::kVp10CodecId[] = "V_VP10"; +const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS"; +const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS"; +const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA"; +const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES"; Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {} @@ -1650,9 +1964,11 @@ bool Chapter::ExpandDisplaysArray() { uint64_t Chapter::WriteAtom(IMkvWriter* writer) const { uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapterStringUID, id_) + - EbmlElementSize(libwebm::kMkvChapterUID, uid_) + - EbmlElementSize(libwebm::kMkvChapterTimeStart, start_timecode_) + - EbmlElementSize(libwebm::kMkvChapterTimeEnd, end_timecode_); + EbmlElementSize(libwebm::kMkvChapterUID, static_cast(uid_)) + + EbmlElementSize(libwebm::kMkvChapterTimeStart, + static_cast(start_timecode_)) + + EbmlElementSize(libwebm::kMkvChapterTimeEnd, + static_cast(end_timecode_)); for (int idx = 0; idx < displays_count_; ++idx) { const Display& d = displays_[idx]; @@ -1674,13 +1990,16 @@ uint64_t Chapter::WriteAtom(IMkvWriter* writer) const { if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_)) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, uid_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, + static_cast(uid_))) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, start_timecode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, + static_cast(start_timecode_))) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, end_timecode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, + static_cast(end_timecode_))) return 0; for (int idx = 0; idx < displays_count_; ++idx) { @@ -2125,7 +2444,17 @@ Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale, write_last_frame_with_duration_(write_last_frame_with_duration), writer_(NULL) {} -Cluster::~Cluster() {} +Cluster::~Cluster() { + // Delete any stored frames that are left behind. This will happen if the + // Cluster was not Finalized for whatever reason. 
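The destructor body that follows frees any frames still queued per track when a Cluster is destroyed without being finalized. A standalone sketch of the same ownership rule, using a hypothetical stand-in Frame type rather than mkvmuxer::Frame:

```cpp
#include <cstdint>
#include <list>
#include <map>

struct Frame {};  // stand-in for mkvmuxer::Frame, for illustration only

// Free every frame still queued per track and empty the container, so that
// destroying the owner without finalizing it does not leak the frames.
void FreeQueuedFrames(std::map<uint64_t, std::list<Frame*> >* stored_frames) {
  for (std::map<uint64_t, std::list<Frame*> >::iterator it =
           stored_frames->begin();
       it != stored_frames->end(); ++it) {
    while (!it->second.empty()) {
      delete it->second.front();
      it->second.pop_front();
    }
  }
  stored_frames->clear();
}
```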
+ while (!stored_frames_.empty()) { + while (!stored_frames_.begin()->second.empty()) { + delete stored_frames_.begin()->second.front(); + stored_frames_.begin()->second.pop_front(); + } + stored_frames_.erase(stored_frames_.begin()->first); + } +} bool Cluster::Init(IMkvWriter* ptr_writer) { if (!ptr_writer) { @@ -2421,10 +2750,10 @@ bool SeekHead::Finalize(IMkvWriter* writer) const { for (int32_t i = 0; i < kSeekEntryCount; ++i) { if (seek_entry_id_[i] != 0) { - entry_size[i] = EbmlElementSize( - libwebm::kMkvSeekID, static_cast(seek_entry_id_[i])); - entry_size[i] += - EbmlElementSize(libwebm::kMkvSeekPosition, seek_entry_pos_[i]); + entry_size[i] = EbmlElementSize(libwebm::kMkvSeekID, + static_cast(seek_entry_id_[i])); + entry_size[i] += EbmlElementSize( + libwebm::kMkvSeekPosition, static_cast(seek_entry_pos_[i])); payload_size += EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) + @@ -2449,11 +2778,11 @@ bool SeekHead::Finalize(IMkvWriter* writer) const { return false; if (!WriteEbmlElement(writer, libwebm::kMkvSeekID, - static_cast(seek_entry_id_[i]))) + static_cast(seek_entry_id_[i]))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition, - seek_entry_pos_[i])) + static_cast(seek_entry_pos_[i]))) return false; } } @@ -2522,8 +2851,10 @@ bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) { uint64_t SeekHead::MaxEntrySize() const { const uint64_t max_entry_payload_size = - EbmlElementSize(libwebm::kMkvSeekID, UINT64_C(0xffffffff)) + - EbmlElementSize(libwebm::kMkvSeekPosition, UINT64_C(0xffffffffffffffff)); + EbmlElementSize(libwebm::kMkvSeekID, + static_cast(UINT64_C(0xffffffff))) + + EbmlElementSize(libwebm::kMkvSeekPosition, + static_cast(UINT64_C(0xffffffffffffffff))); const uint64_t max_entry_size = EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) + max_entry_payload_size; @@ -2613,7 +2944,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (!writer || !muxing_app_ || !writing_app_) return false; - uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, timecode_scale_); + uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, + static_cast(timecode_scale_)); if (duration_ > 0.0) size += EbmlElementSize(libwebm::kMkvDuration, static_cast(duration_)); @@ -2629,7 +2961,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, timecode_scale_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, + static_cast(timecode_scale_))) return false; if (duration_ > 0.0) { @@ -2725,10 +3058,12 @@ Segment::Segment() output_cues_(true), accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), + estimate_file_duration_(false), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), doc_type_version_written_(0), + duration_(0.0), writer_cluster_(NULL), writer_cues_(NULL), writer_header_(NULL) { @@ -2833,6 +3168,10 @@ bool Segment::Init(IMkvWriter* ptr_writer) { writer_cluster_ = ptr_writer; writer_cues_ = ptr_writer; writer_header_ = ptr_writer; + memset(&track_frames_written_, 0, + sizeof(track_frames_written_[0]) * kMaxTrackNumber); + memset(&last_track_timestamp_, 0, + sizeof(last_track_timestamp_[0]) * kMaxTrackNumber); return segment_info_.Init(); } @@ -2876,7 +3215,10 @@ bool Segment::Finalize() { if (WriteFramesAll() < 0) return false; - if (cluster_list_size_ > 0) { + // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_| + // is set. 
In all other modes, always call Cluster::Finalize. + if ((mode_ == kLive ? accurate_cluster_duration_ : true) && + cluster_list_size_ > 0) { // Update last cluster's size Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; @@ -2892,9 +3234,30 @@ bool Segment::Finalize() { chunk_count_++; } - const double duration = + double duration = (static_cast(last_timestamp_) + last_block_duration_) / segment_info_.timecode_scale(); + if (duration_ > 0.0) { + duration = duration_; + } else { + if (last_block_duration_ == 0 && estimate_file_duration_) { + const int num_tracks = static_cast(tracks_.track_entries_size()); + for (int i = 0; i < num_tracks; ++i) { + if (track_frames_written_[i] < 2) + continue; + + // Estimate the duration for the last block of a Track. + const double nano_per_frame = + static_cast(last_track_timestamp_[i]) / + (track_frames_written_[i] - 1); + const double track_duration = + (last_track_timestamp_[i] + nano_per_frame) / + segment_info_.timecode_scale(); + if (track_duration > duration) + duration = track_duration; + } + } + } segment_info_.set_duration(duration); if (!segment_info_.Finalize(writer_header_)) return false; @@ -2941,7 +3304,9 @@ bool Segment::Finalize() { if (writer_header_->Position(0)) return false; - if (!WriteEbmlHeader(writer_header_, doc_type_version_)) + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) return false; if (writer_header_->Position() != ebml_header_size_) return false; @@ -3001,7 +3366,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { track->set_width(width); track->set_height(height); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } has_video_ = true; return track->number(); @@ -3023,8 +3391,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { cue->set_block_number(cluster->blocks_added()); cue->set_cluster_pos(cluster->position_for_cues()); cue->set_track(track); - if (!cues_.AddCue(cue)) + if (!cues_.AddCue(cue)) { + delete cue; return false; + } new_cuepoint_ = false; return true; @@ -3041,7 +3411,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, track->set_sample_rate(sample_rate); track->set_channels(channels); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } return track->number(); } @@ -3130,15 +3503,35 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (frame->discard_padding() != 0) doc_type_version_ = 4; + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + // If the segment has a video track hold onto audio frames to make sure the // audio that is associated with the start time of a video key-frame is // muxed into the same cluster. 
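When the final block of a track carries no duration and estimate_file_duration is enabled, Finalize() above assumes the last frame lasts one average inter-frame interval. A standalone sketch of that arithmetic with illustrative inputs (result in nanoseconds; Finalize() then divides by the timecode scale):

```cpp
#include <cstdint>

// With N frames written and last timestamp T, the average spacing is
// T / (N - 1); the track is assumed to end one such interval after its last
// frame. Returns 0.0 when there are too few frames to derive an interval.
double EstimateTrackEndNs(uint64_t last_timestamp_ns,
                          uint64_t frames_written) {
  if (frames_written < 2)
    return 0.0;
  const double ns_per_frame =
      static_cast<double>(last_timestamp_ns) / (frames_written - 1);
  return static_cast<double>(last_timestamp_ns) + ns_per_frame;
}
```

An explicit Segment::set_duration() value still takes precedence over both the measured and the estimated duration, matching the |duration_| check above.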
if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && !force_new_cluster_) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame || !new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; - return QueueFrame(new_frame); + } + if (!QueueFrame(new_frame)) { + delete new_frame; + return false; + } + track_frames_written_[frame->track_number() - 1]++; + return true; } if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(), @@ -3159,8 +3552,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (!frame->CanBeSimpleBlock() && !frame->is_key() && !frame->reference_block_timestamp_set()) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; + } new_frame->set_reference_block_timestamp( last_track_timestamp_[frame->track_number() - 1]); frame = new_frame; @@ -3178,10 +3573,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { last_timestamp_ = frame->timestamp(); last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); last_block_duration_ = frame->duration(); + track_frames_written_[frame->track_number() - 1]++; if (frame_created) delete frame; - return true; } @@ -3292,8 +3687,9 @@ Track* Segment::GetTrackByNumber(uint64_t track_number) const { bool Segment::WriteSegmentHeader() { UpdateDocTypeVersion(); - // TODO(fgalligan): Support more than one segment. - if (!WriteEbmlHeader(writer_header_, doc_type_version_)) + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) return false; doc_type_version_written_ = doc_type_version_; ebml_header_size_ = static_cast(writer_header_->Position()); @@ -3766,4 +4162,35 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { return true; } +bool Segment::DocTypeIsWebm() const { + const int kNumCodecIds = 9; + + // TODO(vigneshv): Tweak .clang-format. + const char* kWebmCodecIds[kNumCodecIds] = { + Tracks::kOpusCodecId, Tracks::kVorbisCodecId, + Tracks::kVp8CodecId, Tracks::kVp9CodecId, + Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId, + Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, + Tracks::kWebVttSubtitlesId}; + + const int num_tracks = static_cast(tracks_.track_entries_size()); + for (int track_index = 0; track_index < num_tracks; ++track_index) { + const Track* const track = tracks_.GetTrackByIndex(track_index); + const std::string codec_id = track->codec_id(); + + bool id_is_webm = false; + for (int id_index = 0; id_index < kNumCodecIds; ++id_index) { + if (codec_id == kWebmCodecIds[id_index]) { + id_is_webm = true; + break; + } + } + + if (!id_is_webm) + return false; + } + + return true; +} + } // namespace mkvmuxer diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h index 55ba07196d..46b0029dc4 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -64,6 +64,12 @@ class IMkvWriter { LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter); }; +// Writes out the EBML header for a WebM file, but allows caller to specify +// DocType. This function must be called before any other libwebm writing +// functions are called. +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type); + // Writes out the EBML header for a WebM file. 
This function must be called // before any other libwebm writing functions are called. bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version); @@ -348,26 +354,42 @@ class ContentEncoding { /////////////////////////////////////////////////////////////// // Colour element. -struct PrimaryChromaticity { - PrimaryChromaticity(float x_val, float y_val) : x(x_val), y(y_val) {} - PrimaryChromaticity() : x(0), y(0) {} +class PrimaryChromaticity { + public: + static const float kChromaticityMin; + static const float kChromaticityMax; + + PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {} + PrimaryChromaticity() : x_(0), y_(0) {} ~PrimaryChromaticity() {} - uint64_t PrimaryChromaticityPayloadSize(libwebm::MkvId x_id, - libwebm::MkvId y_id) const; + + // Returns sum of |x_id| and |y_id| element id sizes and payload sizes. + uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + bool Valid() const; bool Write(IMkvWriter* writer, libwebm::MkvId x_id, libwebm::MkvId y_id) const; - float x; - float y; + float x() const { return x_; } + void set_x(float new_x) { x_ = new_x; } + float y() const { return y_; } + void set_y(float new_y) { y_ = new_y; } + + private: + float x_; + float y_; }; class MasteringMetadata { public: static const float kValueNotPresent; + static const float kMinLuminance; + static const float kMinLuminanceMax; + static const float kMaxLuminanceMax; MasteringMetadata() - : luminance_max(kValueNotPresent), - luminance_min(kValueNotPresent), + : luminance_max_(kValueNotPresent), + luminance_min_(kValueNotPresent), r_(NULL), g_(NULL), b_(NULL), @@ -381,6 +403,7 @@ class MasteringMetadata { // Returns total size of the MasteringMetadata element. uint64_t MasteringMetadataSize() const; + bool Valid() const; bool Write(IMkvWriter* writer) const; // Copies non-null chromaticity. @@ -393,13 +416,21 @@ class MasteringMetadata { const PrimaryChromaticity* b() const { return b_; } const PrimaryChromaticity* white_point() const { return white_point_; } - float luminance_max; - float luminance_min; + float luminance_max() const { return luminance_max_; } + void set_luminance_max(float luminance_max) { + luminance_max_ = luminance_max; + } + float luminance_min() const { return luminance_min_; } + void set_luminance_min(float luminance_min) { + luminance_min_ = luminance_min; + } private: // Returns size of MasteringMetadata child elements. uint64_t PayloadSize() const; + float luminance_max_; + float luminance_min_; PrimaryChromaticity* r_; PrimaryChromaticity* g_; PrimaryChromaticity* b_; @@ -408,26 +439,90 @@ class MasteringMetadata { class Colour { public: + enum MatrixCoefficients { + kGbr = 0, + kBt709 = 1, + kUnspecifiedMc = 2, + kReserved = 3, + kFcc = 4, + kBt470bg = 5, + kSmpte170MMc = 6, + kSmpte240MMc = 7, + kYcocg = 8, + kBt2020NonConstantLuminance = 9, + kBt2020ConstantLuminance = 10, + }; + enum ChromaSitingHorz { + kUnspecifiedCsh = 0, + kLeftCollocated = 1, + kHalfCsh = 2, + }; + enum ChromaSitingVert { + kUnspecifiedCsv = 0, + kTopCollocated = 1, + kHalfCsv = 2, + }; + enum Range { + kUnspecifiedCr = 0, + kBroadcastRange = 1, + kFullRange = 2, + kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics. 
+ }; + enum TransferCharacteristics { + kIturBt709Tc = 1, + kUnspecifiedTc = 2, + kReservedTc = 3, + kGamma22Curve = 4, + kGamma28Curve = 5, + kSmpte170MTc = 6, + kSmpte240MTc = 7, + kLinear = 8, + kLog = 9, + kLogSqrt = 10, + kIec6196624 = 11, + kIturBt1361ExtendedColourGamut = 12, + kIec6196621 = 13, + kIturBt202010bit = 14, + kIturBt202012bit = 15, + kSmpteSt2084 = 16, + kSmpteSt4281Tc = 17, + kAribStdB67Hlg = 18, + }; + enum Primaries { + kReservedP0 = 0, + kIturBt709P = 1, + kUnspecifiedP = 2, + kReservedP3 = 3, + kIturBt470M = 4, + kIturBt470Bg = 5, + kSmpte170MP = 6, + kSmpte240MP = 7, + kFilm = 8, + kIturBt2020 = 9, + kSmpteSt4281P = 10, + kJedecP22Phosphors = 22, + }; static const uint64_t kValueNotPresent; Colour() - : matrix_coefficients(kValueNotPresent), - bits_per_channel(kValueNotPresent), - chroma_subsampling_horz(kValueNotPresent), - chroma_subsampling_vert(kValueNotPresent), - cb_subsampling_horz(kValueNotPresent), - cb_subsampling_vert(kValueNotPresent), - chroma_siting_horz(kValueNotPresent), - chroma_siting_vert(kValueNotPresent), - range(kValueNotPresent), - transfer_characteristics(kValueNotPresent), - primaries(kValueNotPresent), - max_cll(kValueNotPresent), - max_fall(kValueNotPresent), + : matrix_coefficients_(kValueNotPresent), + bits_per_channel_(kValueNotPresent), + chroma_subsampling_horz_(kValueNotPresent), + chroma_subsampling_vert_(kValueNotPresent), + cb_subsampling_horz_(kValueNotPresent), + cb_subsampling_vert_(kValueNotPresent), + chroma_siting_horz_(kValueNotPresent), + chroma_siting_vert_(kValueNotPresent), + range_(kValueNotPresent), + transfer_characteristics_(kValueNotPresent), + primaries_(kValueNotPresent), + max_cll_(kValueNotPresent), + max_fall_(kValueNotPresent), mastering_metadata_(NULL) {} ~Colour() { delete mastering_metadata_; } // Returns total size of the Colour element. uint64_t ColourSize() const; + bool Valid() const; bool Write(IMkvWriter* writer) const; // Deep copies |mastering_metadata|. 
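The enums above name the code points stored by the Colour element. A hedged sketch (hypothetical helper) that fills in a BT.709, broadcast-range Colour through the new setters and checks one field with the validation helpers added to mkvmuxerutil:

```cpp
#include "mkvmuxer/mkvmuxer.h"
#include "mkvmuxer/mkvmuxerutil.h"

// Hypothetical helper: populate a BT.709 Colour element and attach it.
bool ConfigureBt709Colour(mkvmuxer::VideoTrack* track) {
  mkvmuxer::Colour colour;
  colour.set_matrix_coefficients(mkvmuxer::Colour::kBt709);
  colour.set_transfer_characteristics(mkvmuxer::Colour::kIturBt709Tc);
  colour.set_primaries(mkvmuxer::Colour::kIturBt709P);
  colour.set_range(mkvmuxer::Colour::kBroadcastRange);
  // Defensive check via one of the new validation helpers.
  if (!mkvmuxer::IsMatrixCoefficientsValueValid(colour.matrix_coefficients()))
    return false;
  // SetColour() deep-copies the element into the track.
  return track->SetColour(colour);
}
```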
@@ -437,27 +532,124 @@ class Colour { return mastering_metadata_; } - uint64_t matrix_coefficients; - uint64_t bits_per_channel; - uint64_t chroma_subsampling_horz; - uint64_t chroma_subsampling_vert; - uint64_t cb_subsampling_horz; - uint64_t cb_subsampling_vert; - uint64_t chroma_siting_horz; - uint64_t chroma_siting_vert; - uint64_t range; - uint64_t transfer_characteristics; - uint64_t primaries; - uint64_t max_cll; - uint64_t max_fall; + uint64_t matrix_coefficients() const { return matrix_coefficients_; } + void set_matrix_coefficients(uint64_t matrix_coefficients) { + matrix_coefficients_ = matrix_coefficients; + } + uint64_t bits_per_channel() const { return bits_per_channel_; } + void set_bits_per_channel(uint64_t bits_per_channel) { + bits_per_channel_ = bits_per_channel; + } + uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; } + void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) { + chroma_subsampling_horz_ = chroma_subsampling_horz; + } + uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; } + void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) { + chroma_subsampling_vert_ = chroma_subsampling_vert; + } + uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; } + void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) { + cb_subsampling_horz_ = cb_subsampling_horz; + } + uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; } + void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) { + cb_subsampling_vert_ = cb_subsampling_vert; + } + uint64_t chroma_siting_horz() const { return chroma_siting_horz_; } + void set_chroma_siting_horz(uint64_t chroma_siting_horz) { + chroma_siting_horz_ = chroma_siting_horz; + } + uint64_t chroma_siting_vert() const { return chroma_siting_vert_; } + void set_chroma_siting_vert(uint64_t chroma_siting_vert) { + chroma_siting_vert_ = chroma_siting_vert; + } + uint64_t range() const { return range_; } + void set_range(uint64_t range) { range_ = range; } + uint64_t transfer_characteristics() const { + return transfer_characteristics_; + } + void set_transfer_characteristics(uint64_t transfer_characteristics) { + transfer_characteristics_ = transfer_characteristics; + } + uint64_t primaries() const { return primaries_; } + void set_primaries(uint64_t primaries) { primaries_ = primaries; } + uint64_t max_cll() const { return max_cll_; } + void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; } + uint64_t max_fall() const { return max_fall_; } + void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; } private: // Returns size of Colour child elements. uint64_t PayloadSize() const; + uint64_t matrix_coefficients_; + uint64_t bits_per_channel_; + uint64_t chroma_subsampling_horz_; + uint64_t chroma_subsampling_vert_; + uint64_t cb_subsampling_horz_; + uint64_t cb_subsampling_vert_; + uint64_t chroma_siting_horz_; + uint64_t chroma_siting_vert_; + uint64_t range_; + uint64_t transfer_characteristics_; + uint64_t primaries_; + uint64_t max_cll_; + uint64_t max_fall_; + MasteringMetadata* mastering_metadata_; }; +/////////////////////////////////////////////////////////////// +// Projection element. 
+class Projection { + public: + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const uint64_t kValueNotPresent; + Projection() + : type_(kRectangular), + pose_yaw_(0.0), + pose_pitch_(0.0), + pose_roll_(0.0), + private_data_(NULL), + private_data_length_(0) {} + ~Projection() { delete[] private_data_; } + + uint64_t ProjectionSize() const; + bool Write(IMkvWriter* writer) const; + + bool SetProjectionPrivate(const uint8_t* private_data, + uint64_t private_data_length); + + ProjectionType type() const { return type_; } + void set_type(ProjectionType type) { type_ = type; } + float pose_yaw() const { return pose_yaw_; } + void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; } + float pose_pitch() const { return pose_pitch_; } + void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; } + float pose_roll() const { return pose_roll_; } + void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; } + uint8_t* private_data() const { return private_data_; } + uint64_t private_data_length() const { return private_data_length_; } + + private: + // Returns size of VideoProjection child elements. + uint64_t PayloadSize() const; + + ProjectionType type_; + float pose_yaw_; + float pose_pitch_; + float pose_roll_; + uint8_t* private_data_; + uint64_t private_data_length_; +}; + /////////////////////////////////////////////////////////////// // Track element. class Track { @@ -581,6 +773,10 @@ class VideoTrack : public Track { uint64_t display_height() const { return display_height_; } void set_display_width(uint64_t width) { display_width_ = width; } uint64_t display_width() const { return display_width_; } + void set_pixel_height(uint64_t height) { pixel_height_ = height; } + uint64_t pixel_height() const { return pixel_height_; } + void set_pixel_width(uint64_t width) { pixel_width_ = width; } + uint64_t pixel_width() const { return pixel_width_; } void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; } uint64_t crop_left() const { return crop_left_; } @@ -605,6 +801,11 @@ class VideoTrack : public Track { // Deep copies |colour|. bool SetColour(const Colour& colour); + Projection* projection() { return projection_; } + + // Deep copies |projection|. + bool SetProjection(const Projection& projection); + private: // Returns the size in bytes of the Video element. uint64_t VideoPayloadSize() const; @@ -612,6 +813,8 @@ class VideoTrack : public Track { // Video track element names. 
uint64_t display_height_; uint64_t display_width_; + uint64_t pixel_height_; + uint64_t pixel_width_; uint64_t crop_left_; uint64_t crop_right_; uint64_t crop_top_; @@ -623,6 +826,7 @@ class VideoTrack : public Track { uint64_t width_; Colour* colour_; + Projection* projection_; LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack); }; @@ -670,6 +874,10 @@ class Tracks { static const char kVp8CodecId[]; static const char kVp9CodecId[]; static const char kVp10CodecId[]; + static const char kWebVttCaptionsId[]; + static const char kWebVttDescriptionsId[]; + static const char kWebVttMetadataId[]; + static const char kWebVttSubtitlesId[]; Tracks(); ~Tracks(); @@ -1294,8 +1502,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - const static uint32_t kDefaultDocTypeVersion = 2; - const static uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static const uint32_t kDefaultDocTypeVersion = 4; + static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); @@ -1481,7 +1689,16 @@ class Segment { Mode mode() const { return mode_; } CuesPosition cues_position() const { return cues_position_; } bool output_cues() const { return output_cues_; } + void set_estimate_file_duration(bool estimate_duration) { + estimate_file_duration_ = estimate_duration; + } + bool estimate_file_duration() const { return estimate_file_duration_; } const SegmentInfo* segment_info() const { return &segment_info_; } + void set_duration(double duration) { duration_ = duration; } + double duration() const { return duration_; } + + // Returns true when codec IDs are valid for WebM. + bool DocTypeIsWebm() const; private: // Checks if header information has been output and initialized. If not it @@ -1637,6 +1854,9 @@ class Segment { // Last timestamp in nanoseconds by track number added to a cluster. uint64_t last_track_timestamp_[kMaxTrackNumber]; + // Number of frames written per track. + uint64_t track_frames_written_[kMaxTrackNumber]; + // Maximum time in nanoseconds for a cluster duration. This variable is a // guideline and some clusters may have a longer duration. Default is 30 // seconds. @@ -1665,6 +1885,9 @@ class Segment { // Flag whether or not to write the Cluster Timecode using exactly 8 bytes. bool fixed_size_cluster_timecode_; + // Flag whether or not to estimate the file duration. + bool estimate_file_duration_; + // The size of the EBML header, used to validate the header if // WriteEbmlHeader() is called more than once. int32_t ebml_header_size_; @@ -1682,6 +1905,9 @@ class Segment { uint32_t doc_type_version_; uint32_t doc_type_version_written_; + // If |duration_| is > 0, then explicitly set the duration of the segment. + double duration_; + // Pointer to the writer objects. Not owned by this class. IMkvWriter* writer_cluster_; IMkvWriter* writer_cues_; diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 3562b8ab82..355d4e22b3 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -10,6 +10,7 @@ #ifdef __ANDROID__ #include +#include #endif #include @@ -31,20 +32,20 @@ namespace { // Date elements are always 8 octets in size. 
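A hedged sketch of the two new duration controls on Segment, assuming the stock file-backed MkvWriter; the output path and the 90000-tick value (90 seconds at the default 1 ms timecode scale) are illustrative. Finalize() also picks the EBML DocType ("webm" or "matroska") automatically via DocTypeIsWebm().

```cpp
#include "mkvmuxer/mkvmuxer.h"
#include "mkvmuxer/mkvwriter.h"

// Hypothetical end-to-end outline, not a complete muxing example.
bool MuxWithDurationControls(const char* path) {
  mkvmuxer::MkvWriter writer;
  if (!writer.Open(path))
    return false;

  mkvmuxer::Segment segment;
  if (!segment.Init(&writer))
    return false;

  // Either let Finalize() estimate the tail duration per track...
  segment.set_estimate_file_duration(true);
  // ...or pin the segment duration outright; an explicit value wins.
  segment.set_duration(90000.0);

  // ... add tracks and write frames here ...

  const bool ok = segment.Finalize();
  writer.Close();
  return ok;
}
```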
const int kDateElementSize = 8; -uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, - int64_t timecode, uint64_t timecode_scale) { - uint64_t block_additional_elem_size = 0; - uint64_t block_addid_elem_size = 0; - uint64_t block_more_payload_size = 0; - uint64_t block_more_elem_size = 0; - uint64_t block_additions_payload_size = 0; - uint64_t block_additions_elem_size = 0; +uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, + uint64 timecode_scale) { + uint64 block_additional_elem_size = 0; + uint64 block_addid_elem_size = 0; + uint64 block_more_payload_size = 0; + uint64 block_more_elem_size = 0; + uint64 block_additions_payload_size = 0; + uint64 block_additions_elem_size = 0; if (frame->additional()) { block_additional_elem_size = EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(), frame->additional_length()); - block_addid_elem_size = - EbmlElementSize(libwebm::kMkvBlockAddID, frame->add_id()); + block_addid_elem_size = EbmlElementSize( + libwebm::kMkvBlockAddID, static_cast(frame->add_id())); block_more_payload_size = block_addid_elem_size + block_additional_elem_size; @@ -58,32 +59,33 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_additions_payload_size; } - uint64_t discard_padding_elem_size = 0; + uint64 discard_padding_elem_size = 0; if (frame->discard_padding() != 0) { discard_padding_elem_size = - EbmlElementSize(libwebm::kMkvDiscardPadding, frame->discard_padding()); + EbmlElementSize(libwebm::kMkvDiscardPadding, + static_cast(frame->discard_padding())); } - const uint64_t reference_block_timestamp = + const uint64 reference_block_timestamp = frame->reference_block_timestamp() / timecode_scale; - uint64_t reference_block_elem_size = 0; + uint64 reference_block_elem_size = 0; if (!frame->is_key()) { reference_block_elem_size = EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp); } - const uint64_t duration = frame->duration() / timecode_scale; - uint64_t block_duration_elem_size = 0; + const uint64 duration = frame->duration() / timecode_scale; + uint64 block_duration_elem_size = 0; if (duration > 0) block_duration_elem_size = EbmlElementSize(libwebm::kMkvBlockDuration, duration); - const uint64_t block_payload_size = 4 + frame->length(); - const uint64_t block_elem_size = + const uint64 block_payload_size = 4 + frame->length(); + const uint64 block_elem_size = EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) + block_payload_size; - const uint64_t block_group_payload_size = + const uint64 block_group_payload_size = block_elem_size + block_additions_elem_size + block_duration_elem_size + discard_padding_elem_size + reference_block_elem_size; @@ -105,7 +107,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, if (SerializeInt(writer, 0, 1)) return 0; - if (writer->Write(frame->frame(), static_cast(frame->length()))) + if (writer->Write(frame->frame(), static_cast(frame->length()))) return 0; if (frame->additional()) { @@ -118,7 +120,8 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_more_payload_size)) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, frame->add_id())) + if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, + static_cast(frame->add_id()))) return 0; if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional, @@ -129,7 +132,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, if (frame->discard_padding() != 0 && !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding, - 
frame->discard_padding())) { + static_cast(frame->discard_padding()))) { return false; } @@ -148,38 +151,38 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_group_payload_size; } -uint64_t WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame, - int64_t timecode) { +uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame, + int64 timecode) { if (WriteID(writer, libwebm::kMkvSimpleBlock)) return 0; - const int32_t size = static_cast(frame->length()) + 4; + const int32 size = static_cast(frame->length()) + 4; if (WriteUInt(writer, size)) return 0; - if (WriteUInt(writer, static_cast(frame->track_number()))) + if (WriteUInt(writer, static_cast(frame->track_number()))) return 0; if (SerializeInt(writer, timecode, 2)) return 0; - uint64_t flags = 0; + uint64 flags = 0; if (frame->is_key()) flags |= 0x80; if (SerializeInt(writer, flags, 1)) return 0; - if (writer->Write(frame->frame(), static_cast(frame->length()))) + if (writer->Write(frame->frame(), static_cast(frame->length()))) return 0; - return static_cast(GetUIntSize(libwebm::kMkvSimpleBlock) + - GetCodedUIntSize(size) + 4 + frame->length()); + return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + + frame->length(); } } // namespace -int32_t GetCodedUIntSize(uint64_t value) { +int32 GetCodedUIntSize(uint64 value) { if (value < 0x000000000000007FULL) return 1; else if (value < 0x0000000000003FFFULL) @@ -197,7 +200,7 @@ int32_t GetCodedUIntSize(uint64_t value) { return 8; } -int32_t GetUIntSize(uint64_t value) { +int32 GetUIntSize(uint64 value) { if (value < 0x0000000000000100ULL) return 1; else if (value < 0x0000000000010000ULL) @@ -215,26 +218,26 @@ int32_t GetUIntSize(uint64_t value) { return 8; } -int32_t GetIntSize(int64_t value) { +int32 GetIntSize(int64 value) { // Doubling the requested value ensures positive values with their high bit // set are written with 0-padding to avoid flipping the signedness. - const uint64_t v = (value < 0) ? value ^ -1LL : value; + const uint64 v = (value < 0) ? value ^ -1LL : value; return GetUIntSize(2 * v); } -uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value) { +uint64 EbmlMasterElementSize(uint64 type, uint64 value) { // Size of EBML ID - int32_t ebml_size = GetUIntSize(type); + int32 ebml_size = GetUIntSize(type); // Datasize ebml_size += GetCodedUIntSize(value); - return static_cast(ebml_size); + return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, int64_t value) { +uint64 EbmlElementSize(uint64 type, int64 value) { // Size of EBML ID - int32_t ebml_size = GetUIntSize(type); + int32 ebml_size = GetUIntSize(type); // Datasize ebml_size += GetIntSize(value); @@ -242,20 +245,19 @@ uint64_t EbmlElementSize(uint64_t type, int64_t value) { // Size of Datasize ebml_size++; - return static_cast(ebml_size); + return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, uint64_t value) { +uint64 EbmlElementSize(uint64 type, uint64 value) { return EbmlElementSize(type, value, 0); } -uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) { +uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) { // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize - ebml_size += - (fixed_size > 0) ? fixed_size : static_cast(GetUIntSize(value)); + ebml_size += (fixed_size > 0) ? 
fixed_size : GetUIntSize(value); // Size of Datasize ebml_size++; @@ -263,9 +265,9 @@ uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, float /* value */) { +uint64 EbmlElementSize(uint64 type, float /* value */) { // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += sizeof(float); @@ -276,28 +278,28 @@ uint64_t EbmlElementSize(uint64_t type, float /* value */) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, const char* value) { +uint64 EbmlElementSize(uint64 type, const char* value) { if (!value) return 0; // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += strlen(value); // Size of Datasize - ebml_size++; + ebml_size += GetCodedUIntSize(strlen(value)); return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) { +uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) { if (!value) return 0; // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += size; @@ -308,9 +310,9 @@ uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) { return ebml_size; } -uint64_t EbmlDateElementSize(uint64_t type) { +uint64 EbmlDateElementSize(uint64 type) { // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += kDateElementSize; @@ -321,18 +323,18 @@ uint64_t EbmlDateElementSize(uint64_t type) { return ebml_size; } -int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) { +int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) { if (!writer || size < 1 || size > 8) return -1; - for (int32_t i = 1; i <= size; ++i) { - const int32_t byte_count = size - i; - const int32_t bit_count = byte_count * 8; + for (int32 i = 1; i <= size; ++i) { + const int32 byte_count = size - i; + const int32 bit_count = byte_count * 8; - const int64_t bb = value >> bit_count; - const uint8_t b = static_cast(bb); + const int64 bb = value >> bit_count; + const uint8 b = static_cast(bb); - const int32_t status = writer->Write(&b, 1); + const int32 status = writer->Write(&b, 1); if (status < 0) return status; @@ -341,26 +343,26 @@ int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) { return 0; } -int32_t SerializeFloat(IMkvWriter* writer, float f) { +int32 SerializeFloat(IMkvWriter* writer, float f) { if (!writer) return -1; - assert(sizeof(uint32_t) == sizeof(float)); + assert(sizeof(uint32) == sizeof(float)); // This union is merely used to avoid a reinterpret_cast from float& to // uint32& which will result in violation of strict aliasing. 
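A behavioural fix in this hunk: the string overload of EbmlElementSize() now sizes its length descriptor with GetCodedUIntSize(strlen(value)) instead of always adding one byte, which matters for strings of 127 bytes or more. A standalone sketch of the VINT width rule it relies on:

```cpp
#include <cstdint>

// How many octets an EBML coded (VINT) length field needs. Each octet
// contributes 7 usable bits; thresholds mirror GetCodedUIntSize().
int CodedLengthBytes(uint64_t value) {
  int bytes = 1;
  uint64_t limit = 0x7F;  // one octet encodes values below 0x7F
  while (value >= limit && bytes < 8) {
    ++bytes;
    limit = (limit << 7) | 0x7F;  // 0x3FFF, 0x1FFFFF, ...
  }
  return bytes;
}

// Example: a 20-byte codec ID needs 1 length byte, a 300-byte string needs 2,
// so the old "ebml_size++" under-counted the element size for long strings.
```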
union U32 { - uint32_t u32; + uint32 u32; float f; } value; value.f = f; - for (int32_t i = 1; i <= 4; ++i) { - const int32_t byte_count = 4 - i; - const int32_t bit_count = byte_count * 8; + for (int32 i = 1; i <= 4; ++i) { + const int32 byte_count = 4 - i; + const int32 bit_count = byte_count * 8; - const uint8_t byte = static_cast(value.u32 >> bit_count); + const uint8 byte = static_cast(value.u32 >> bit_count); - const int32_t status = writer->Write(&byte, 1); + const int32 status = writer->Write(&byte, 1); if (status < 0) return status; @@ -369,21 +371,21 @@ int32_t SerializeFloat(IMkvWriter* writer, float f) { return 0; } -int32_t WriteUInt(IMkvWriter* writer, uint64_t value) { +int32 WriteUInt(IMkvWriter* writer, uint64 value) { if (!writer) return -1; - int32_t size = GetCodedUIntSize(value); + int32 size = GetCodedUIntSize(value); return WriteUIntSize(writer, value, size); } -int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { +int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) { if (!writer || size < 0 || size > 8) return -1; if (size > 0) { - const uint64_t bit = 1LL << (size * 7); + const uint64 bit = 1LL << (size * 7); if (value > (bit - 2)) return -1; @@ -391,11 +393,11 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { value |= bit; } else { size = 1; - int64_t bit; + int64 bit; for (;;) { bit = 1LL << (size * 7); - const uint64_t max = bit - 2; + const uint64 max = bit - 2; if (value <= max) break; @@ -412,18 +414,18 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { return SerializeInt(writer, value, size); } -int32_t WriteID(IMkvWriter* writer, uint64_t type) { +int32 WriteID(IMkvWriter* writer, uint64 type) { if (!writer) return -1; writer->ElementStartNotify(type, writer->Position()); - const int32_t size = GetUIntSize(type); + const int32 size = GetUIntSize(type); return SerializeInt(writer, type, size); } -bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) { +bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) { if (!writer) return false; @@ -436,19 +438,19 @@ bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) { return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) { return WriteEbmlElement(writer, type, value, 0); } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, - uint64_t fixed_size) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value, + uint64 fixed_size) { if (!writer) return false; if (WriteID(writer, type)) return false; - uint64_t size = static_cast(GetUIntSize(value)); + uint64 size = GetUIntSize(value); if (fixed_size > 0) { if (size > fixed_size) return false; @@ -457,30 +459,30 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, if (WriteUInt(writer, size)) return false; - if (SerializeInt(writer, value, static_cast(size))) + if (SerializeInt(writer, value, static_cast(size))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) { if (!writer) return false; if (WriteID(writer, type)) return 0; - const uint64_t size = GetIntSize(value); + const uint64 size = GetIntSize(value); if (WriteUInt(writer, size)) return false; - if (SerializeInt(writer, value, static_cast(size))) + if (SerializeInt(writer, value, 
static_cast(size))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) { if (!writer) return false; @@ -496,25 +498,25 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) { return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) { if (!writer || !value) return false; if (WriteID(writer, type)) return false; - const uint64_t length = strlen(value); + const uint64 length = strlen(value); if (WriteUInt(writer, length)) return false; - if (writer->Write(value, static_cast(length))) + if (writer->Write(value, static_cast(length))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, - uint64_t size) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value, + uint64 size) { if (!writer || !value || size < 1) return false; @@ -524,13 +526,13 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, if (WriteUInt(writer, size)) return false; - if (writer->Write(value, static_cast(size))) + if (writer->Write(value, static_cast(size))) return false; return true; } -bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) { +bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) { if (!writer) return false; @@ -546,8 +548,8 @@ bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) { return true; } -uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, - Cluster* cluster) { +uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, + Cluster* cluster) { if (!writer || !frame || !frame->IsValid() || !cluster || !cluster->timecode_scale()) return 0; @@ -556,7 +558,7 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, // timecode for the cluster itself (remember that block timecode // is a signed, 16-bit integer). However, as a simplification we // only permit non-negative cluster-relative timecodes for blocks. - const int64_t relative_timecode = cluster->GetRelativeTimecode( + const int64 relative_timecode = cluster->GetRelativeTimecode( frame->timestamp() / cluster->timecode_scale()); if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode) return 0; @@ -567,20 +569,19 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, cluster->timecode_scale()); } -uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) { +uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { if (!writer) return false; // Subtract one for the void ID and the coded size. 
- uint64_t void_entry_size = size - 1 - GetCodedUIntSize(size - 1); - uint64_t void_size = - EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) + - void_entry_size; + uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1); + uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) + + void_entry_size; if (void_size != size) return 0; - const int64_t payload_position = writer->Position(); + const int64 payload_position = writer->Position(); if (payload_position < 0) return 0; @@ -590,30 +591,29 @@ uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) { if (WriteUInt(writer, void_entry_size)) return 0; - const uint8_t value = 0; - for (int32_t i = 0; i < static_cast(void_entry_size); ++i) { + const uint8 value = 0; + for (int32 i = 0; i < static_cast(void_entry_size); ++i) { if (writer->Write(&value, 1)) return 0; } - const int64_t stop_position = writer->Position(); + const int64 stop_position = writer->Position(); if (stop_position < 0 || - stop_position - payload_position != static_cast(void_size)) + stop_position - payload_position != static_cast(void_size)) return 0; return void_size; } -void GetVersion(int32_t* major, int32_t* minor, int32_t* build, - int32_t* revision) { +void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 2; *build = 1; *revision = 0; } -uint64_t MakeUID(unsigned int* seed) { - uint64_t uid = 0; +uint64 MakeUID(unsigned int* seed) { + uint64 uid = 0; #ifdef __MINGW32__ srand(*seed); @@ -625,21 +625,22 @@ uint64_t MakeUID(unsigned int* seed) { // TODO(fgalligan): Move random number generation to platform specific code. #ifdef _MSC_VER (void)seed; - const int32_t nn = rand(); + const int32 nn = rand(); #elif __ANDROID__ - int32_t temp_num = 1; + (void)seed; + int32 temp_num = 1; int fd = open("/dev/urandom", O_RDONLY); if (fd != -1) { read(fd, &temp_num, sizeof(temp_num)); close(fd); } - const int32_t nn = temp_num; + const int32 nn = temp_num; #elif defined __MINGW32__ - const int32_t nn = rand(); + const int32 nn = rand(); #else - const int32_t nn = rand_r(seed); + const int32 nn = rand_r(seed); #endif - const int32_t n = 0xFF & (nn >> 4); // throw away low-order bits + const int32 n = 0xFF & (nn >> 4); // throw away low-order bits uid |= n; } @@ -647,4 +648,97 @@ uint64_t MakeUID(unsigned int* seed) { return uid; } +bool IsMatrixCoefficientsValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kGbr: + case mkvmuxer::Colour::kBt709: + case mkvmuxer::Colour::kUnspecifiedMc: + case mkvmuxer::Colour::kReserved: + case mkvmuxer::Colour::kFcc: + case mkvmuxer::Colour::kBt470bg: + case mkvmuxer::Colour::kSmpte170MMc: + case mkvmuxer::Colour::kSmpte240MMc: + case mkvmuxer::Colour::kYcocg: + case mkvmuxer::Colour::kBt2020NonConstantLuminance: + case mkvmuxer::Colour::kBt2020ConstantLuminance: + return true; + } + return false; +} + +bool IsChromaSitingHorzValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCsh: + case mkvmuxer::Colour::kLeftCollocated: + case mkvmuxer::Colour::kHalfCsh: + return true; + } + return false; +} + +bool IsChromaSitingVertValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCsv: + case mkvmuxer::Colour::kTopCollocated: + case mkvmuxer::Colour::kHalfCsv: + return true; + } + return false; +} + +bool IsColourRangeValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCr: + case mkvmuxer::Colour::kBroadcastRange: + case mkvmuxer::Colour::kFullRange: + 
case mkvmuxer::Colour::kMcTcDefined: + return true; + } + return false; +} + +bool IsTransferCharacteristicsValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kIturBt709Tc: + case mkvmuxer::Colour::kUnspecifiedTc: + case mkvmuxer::Colour::kReservedTc: + case mkvmuxer::Colour::kGamma22Curve: + case mkvmuxer::Colour::kGamma28Curve: + case mkvmuxer::Colour::kSmpte170MTc: + case mkvmuxer::Colour::kSmpte240MTc: + case mkvmuxer::Colour::kLinear: + case mkvmuxer::Colour::kLog: + case mkvmuxer::Colour::kLogSqrt: + case mkvmuxer::Colour::kIec6196624: + case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut: + case mkvmuxer::Colour::kIec6196621: + case mkvmuxer::Colour::kIturBt202010bit: + case mkvmuxer::Colour::kIturBt202012bit: + case mkvmuxer::Colour::kSmpteSt2084: + case mkvmuxer::Colour::kSmpteSt4281Tc: + case mkvmuxer::Colour::kAribStdB67Hlg: + return true; + } + return false; +} + +bool IsPrimariesValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kReservedP0: + case mkvmuxer::Colour::kIturBt709P: + case mkvmuxer::Colour::kUnspecifiedP: + case mkvmuxer::Colour::kReservedP3: + case mkvmuxer::Colour::kIturBt470M: + case mkvmuxer::Colour::kIturBt470Bg: + case mkvmuxer::Colour::kSmpte170MP: + case mkvmuxer::Colour::kSmpte240MP: + case mkvmuxer::Colour::kFilm: + case mkvmuxer::Colour::kIturBt2020: + case mkvmuxer::Colour::kSmpteSt4281P: + case mkvmuxer::Colour::kJedecP22Phosphors: + return true; + } + return false; +} + } // namespace mkvmuxer diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 0e21a2dcbe..132388da59 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -8,87 +8,104 @@ #ifndef MKVMUXER_MKVMUXERUTIL_H_ #define MKVMUXER_MKVMUXERUTIL_H_ -#include +#include "mkvmuxertypes.h" + +#include "stdint.h" namespace mkvmuxer { class Cluster; class Frame; class IMkvWriter; -const uint64_t kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL; -const int64_t kMaxBlockTimecode = 0x07FFFLL; +// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because +// changing them causes pain for downstream projects. It would be nice if a +// solution that allows removal of the mkvmuxer:: integer types while avoiding +// pain for downstream users of libwebm. Considering that mkvmuxerutil.{cc,h} +// are really, for the great majority of cases, EBML size calculation and writer +// functions, perhaps a more EBML focused utility would be the way to go as a +// first step. + +const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL; +const int64 kMaxBlockTimecode = 0x07FFFLL; // Writes out |value| in Big Endian order. Returns 0 on success. -int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size); +int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size); // Returns the size in bytes of the element. 
-int32_t GetUIntSize(uint64_t value); -int32_t GetIntSize(int64_t value); -int32_t GetCodedUIntSize(uint64_t value); -uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value); -uint64_t EbmlElementSize(uint64_t type, int64_t value); -uint64_t EbmlElementSize(uint64_t type, uint64_t value); -uint64_t EbmlElementSize(uint64_t type, float value); -uint64_t EbmlElementSize(uint64_t type, const char* value); -uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size); -uint64_t EbmlDateElementSize(uint64_t type); +int32 GetUIntSize(uint64 value); +int32 GetIntSize(int64 value); +int32 GetCodedUIntSize(uint64 value); +uint64 EbmlMasterElementSize(uint64 type, uint64 value); +uint64 EbmlElementSize(uint64 type, int64 value); +uint64 EbmlElementSize(uint64 type, uint64 value); +uint64 EbmlElementSize(uint64 type, float value); +uint64 EbmlElementSize(uint64 type, const char* value); +uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size); +uint64 EbmlDateElementSize(uint64 type); // Returns the size in bytes of the element assuming that the element was // written using |fixed_size| bytes. If |fixed_size| is set to zero, then it // computes the necessary number of bytes based on |value|. -uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size); +uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size); // Creates an EBML coded number from |value| and writes it out. The size of // the coded number is determined by the value of |value|. |value| must not // be in a coded form. Returns 0 on success. -int32_t WriteUInt(IMkvWriter* writer, uint64_t value); +int32 WriteUInt(IMkvWriter* writer, uint64 value); // Creates an EBML coded number from |value| and writes it out. The size of // the coded number is determined by the value of |size|. |value| must not // be in a coded form. Returns 0 on success. -int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size); +int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size); // Output an Mkv master element. Returns true if the element was written. -bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t value, uint64_t size); +bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size); // Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the // ID to |SerializeInt|. Returns 0 on success. -int32_t WriteID(IMkvWriter* writer, uint64_t type); +int32 WriteID(IMkvWriter* writer, uint64 type); // Output an Mkv non-master element. Returns true if the element was written. -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, - uint64_t size); -bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value, + uint64 size); +bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value); // Output an Mkv non-master element using fixed size. 
The element will be // written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero // then it computes the necessary number of bytes based on |value|. Returns true // if the element was written. -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, - uint64_t fixed_size); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value, + uint64 fixed_size); // Output a Mkv Frame. It decides the correct element to write (Block vs // SimpleBlock) based on the parameters of the Frame. -uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, - Cluster* cluster); +uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, + Cluster* cluster); // Output a void element. |size| must be the entire size in bytes that will be // void. The function will calculate the size of the void header and subtract // it from |size|. -uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size); +uint64 WriteVoidElement(IMkvWriter* writer, uint64 size); // Returns the version number of the muxer in |major|, |minor|, |build|, // and |revision|. -void GetVersion(int32_t* major, int32_t* minor, int32_t* build, - int32_t* revision); +void GetVersion(int32* major, int32* minor, int32* build, int32* revision); // Returns a random number to be used for UID, using |seed| to seed // the random-number generator (see POSIX rand_r() for semantics). -uint64_t MakeUID(unsigned int* seed); +uint64 MakeUID(unsigned int* seed); + +// Colour field validation helpers. All return true when |value| is valid. +bool IsMatrixCoefficientsValueValid(uint64_t value); +bool IsChromaSitingHorzValueValid(uint64_t value); +bool IsChromaSitingVertValueValid(uint64_t value); +bool IsColourRangeValueValid(uint64_t value); +bool IsTransferCharacteristicsValueValid(uint64_t value); +bool IsPrimariesValueValid(uint64_t value); } // namespace mkvmuxer diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index ca48e149c6..84655d802a 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvwriter.h" +#include + #ifdef _MSC_VER #include // for _SH_DENYWR #endif @@ -77,7 +79,7 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); #else - return fseek(file_, position, SEEK_SET); + return fseeko(file_, static_cast(position), SEEK_SET); #endif } diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index 21801154d9..37f230d0a9 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,9 +22,15 @@ #include "common/webmids.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvparser { const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; +const float Projection::kValueNotPresent = FLT_MAX; #ifdef MSC_COMPAT inline bool isnan(double val) { return !!_isnan(val); } @@ -1475,6 +1481,8 @@ long Segment::Load() { } } +SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {} + SeekHead::SeekHead(Segment* pSegment, long long start, long long size_, long long element_start, long long element_size) : m_pSegment(pSegment), @@ -1525,15 +1533,19 @@ 
long SeekHead::Parse() { if (pos != stop) return E_FILE_FORMAT_INVALID; - m_entries = new (std::nothrow) Entry[entry_count]; + if (entry_count > 0) { + m_entries = new (std::nothrow) Entry[entry_count]; - if (m_entries == NULL) - return -1; + if (m_entries == NULL) + return -1; + } - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + if (void_element_count > 0) { + m_void_elements = new (std::nothrow) VoidElement[void_element_count]; - if (m_void_elements == NULL) - return -1; + if (m_void_elements == NULL) + return -1; + } // now parse the entries and void elements @@ -1552,14 +1564,14 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) { + if (id == libwebm::kMkvSeek && entry_count > 0) { if (ParseEntry(pReader, pos, size, pEntry)) { Entry& e = *pEntry++; e.element_start = idpos; e.element_size = (pos + size) - idpos; } - } else if (id == libwebm::kMkvVoid) { + } else if (id == libwebm::kMkvVoid && void_element_count > 0) { VoidElement& e = *pVoidElement++; e.element_start = idpos; @@ -1766,18 +1778,7 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_, if ((pos + seekIdSize) > stop) return false; - // Note that the SeekId payload really is serialized - // as a "Matroska integer", not as a plain binary value. - // In fact, Matroska requires that ID values in the - // stream exactly match the binary representation as listed - // in the Matroska specification. - // - // This parser is more liberal, and permits IDs to have - // any width. (This could make the representation in the stream - // different from what's in the spec, but it doesn't matter here, - // since we always normalize "Matroska integer" values.) - - pEntry->id = ReadUInt(pReader, pos, len); // payload + pEntry->id = ReadID(pReader, pos, len); // payload if (pEntry->id <= 0) return false; @@ -2434,7 +2435,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, } const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { - assert(pTrack); + if (pTrack == NULL) { + return NULL; + } const long long n = pTrack->GetNumber(); @@ -4034,7 +4037,7 @@ long SegmentInfo::Parse() { } const double rollover_check = m_duration * m_timecodeScale; - if (rollover_check > LLONG_MAX) + if (rollover_check > static_cast(LLONG_MAX)) return E_FILE_FORMAT_INVALID; if (pos != stop) @@ -4125,7 +4128,7 @@ ContentEncoding::~ContentEncoding() { } const ContentEncoding::ContentCompression* - ContentEncoding::GetCompressionByIndex(unsigned long idx) const { +ContentEncoding::GetCompressionByIndex(unsigned long idx) const { const ptrdiff_t count = compression_entries_end_ - compression_entries_; assert(count >= 0); @@ -4983,29 +4986,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, if (!reader) return false; - std::auto_ptr chromaticity_ptr; + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); - if (!*chromaticity) { - chromaticity_ptr.reset(new PrimaryChromaticity()); - } else { - chromaticity_ptr.reset(*chromaticity); - } - - if (!chromaticity_ptr.get()) + if (!*chromaticity) return false; - float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y; + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; double parser_value = 0; - const long long value_parse_status = + const long long parse_status = UnserializeFloat(reader, read_pos, value_size, parser_value); + // Valid range is [0, 1]. 
Make sure the double is representable as a float + // before casting. + if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 || + (parser_value > 0.0 && parser_value < FLT_MIN)) + return false; + *value = static_cast(parser_value); - if (value_parse_status < 0 || *value < 0.0 || *value > 1.0) - return false; - - *chromaticity = chromaticity_ptr.release(); return true; } @@ -5188,11 +5189,94 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, return true; } +bool Projection::Parse(IMkvReader* reader, long long start, long long size, + Projection** projection) { + if (!reader || *projection) + return false; + + std::auto_ptr projection_ptr(new Projection()); + if (!projection_ptr.get()) + return false; + + const long long end = start + size; + long long read_pos = start; + + while (read_pos < end) { + long long child_id = 0; + long long child_size = 0; + + const long long status = + ParseElementHeader(reader, read_pos, end, child_id, child_size); + if (status < 0) + return false; + + if (child_id == libwebm::kMkvProjectionType) { + long long projection_type = kTypeNotPresent; + projection_type = UnserializeUInt(reader, read_pos, child_size); + if (projection_type < 0) + return false; + + projection_ptr->type = static_cast(projection_type); + } else if (child_id == libwebm::kMkvProjectionPrivate) { + unsigned char* data = SafeArrayAlloc(1, child_size); + + if (data == NULL) + return false; + + const int status = + reader->Read(read_pos, static_cast(child_size), data); + + if (status) { + delete[] data; + return false; + } + + projection_ptr->private_data = data; + projection_ptr->private_data_length = static_cast(child_size); + } else { + double value = 0; + const long long value_parse_status = + UnserializeFloat(reader, read_pos, child_size, value); + // Make sure value is representable as a float before casting. 
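Editor's aside on the representability checks used just below (and in PrimaryChromaticity::Parse above): a double parsed from the stream is only cast to float once it is known to survive the conversion, i.e. its magnitude is not above FLT_MAX and a positive value is not below FLT_MIN. A small standalone sketch of that guard, under those assumptions (the helper name is mine, not part of the patch):

#include <float.h>
/* Sketch: mirrors the intent of the checks in this patch. Zero and
 * in-range negative values pass; positive values below FLT_MIN are
 * rejected because they would underflow when narrowed to float. */
static int fits_in_float_sketch(double v) {
  if (v < -FLT_MAX || v > FLT_MAX) return 0;  /* would overflow a float */
  if (v > 0.0 && v < FLT_MIN) return 0;       /* would underflow */
  return 1;
}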
+ if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } + + switch (child_id) { + case libwebm::kMkvProjectionPoseYaw: + projection_ptr->pose_yaw = static_cast(value); + break; + case libwebm::kMkvProjectionPosePitch: + projection_ptr->pose_pitch = static_cast(value); + break; + case libwebm::kMkvProjectionPoseRoll: + projection_ptr->pose_roll = static_cast(value); + break; + default: + return false; + } + } + + read_pos += child_size; + if (read_pos > end) + return false; + } + + *projection = projection_ptr.release(); + return true; +} + VideoTrack::VideoTrack(Segment* pSegment, long long element_start, long long element_size) - : Track(pSegment, element_start, element_size), m_colour(NULL) {} + : Track(pSegment, element_start, element_size), + m_colour(NULL), + m_projection(NULL) {} -VideoTrack::~VideoTrack() { delete m_colour; } +VideoTrack::~VideoTrack() { + delete m_colour; + delete m_projection; +} long VideoTrack::Parse(Segment* pSegment, const Info& info, long long element_start, long long element_size, @@ -5224,6 +5308,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, const long long stop = pos + s.size; Colour* colour = NULL; + Projection* projection = NULL; while (pos < stop) { long long id, size; @@ -5274,6 +5359,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, } else if (id == libwebm::kMkvColour) { if (!Colour::Parse(pReader, pos, size, &colour)) return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvProjection) { + if (!Projection::Parse(pReader, pos, size, &projection)) + return E_FILE_FORMAT_INVALID; } pos += size; // consume payload @@ -5305,6 +5393,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; pTrack->m_colour = colour; + pTrack->m_projection = projection; pResult = pTrack; return 0; // success @@ -5405,6 +5494,8 @@ long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const { Colour* VideoTrack::GetColour() const { return m_colour; } +Projection* VideoTrack::GetProjection() const { return m_projection; } + long long VideoTrack::GetWidth() const { return m_width; } long long VideoTrack::GetHeight() const { return m_height; } @@ -6698,8 +6789,10 @@ Cluster::Cluster(Segment* pSegment, long idx, long long element_start {} Cluster::~Cluster() { - if (m_entries_count <= 0) + if (m_entries_count <= 0) { + delete[] m_entries; return; + } BlockEntry** i = m_entries; BlockEntry** const j = m_entries + m_entries_count; @@ -7850,7 +7943,6 @@ long Block::Parse(const Cluster* pCluster) { pf = m_frames; while (pf != pf_end) { Frame& f = *pf++; - assert((pos + f.len) <= stop); if ((pos + f.len) > stop) return E_FILE_FORMAT_INVALID; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h index 42e6e88ab4..26c2b7e5eb 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h @@ -473,6 +473,34 @@ struct Colour { MasteringMetadata* mastering_metadata; }; +struct Projection { + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const float kValueNotPresent; + Projection() + : type(kTypeNotPresent), + private_data(NULL), + private_data_length(0), + pose_yaw(kValueNotPresent), + pose_pitch(kValueNotPresent), + pose_roll(kValueNotPresent) {} + ~Projection() { delete[] 
private_data; } + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Projection** projection); + + ProjectionType type; + unsigned char* private_data; + size_t private_data_length; + float pose_yaw; + float pose_pitch; + float pose_roll; +}; + class VideoTrack : public Track { VideoTrack(const VideoTrack&); VideoTrack& operator=(const VideoTrack&); @@ -497,6 +525,8 @@ class VideoTrack : public Track { Colour* GetColour() const; + Projection* GetProjection() const; + private: long long m_width; long long m_height; @@ -508,6 +538,7 @@ class VideoTrack : public Track { double m_rate; Colour* m_colour; + Projection* m_projection; }; class AudioTrack : public Track { @@ -813,6 +844,8 @@ class SeekHead { long Parse(); struct Entry { + Entry(); + // the SeekHead entry payload long long id; long long pos; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index 9f90d8c4f8..23d68f5089 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -7,6 +7,8 @@ // be found in the AUTHORS file in the root of the source tree. #include "mkvparser/mkvreader.h" +#include + #include namespace mkvparser { @@ -117,7 +119,7 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error #else - fseek(m_file, offset, SEEK_SET); + fseeko(m_file, static_cast(offset), SEEK_SET); #endif const size_t size = fread(buffer, 1, len, m_file); @@ -128,4 +130,4 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { return 0; // success } -} // namespace mkvparser \ No newline at end of file +} // namespace mkvparser diff --git a/libs/libvpx/tools.mk b/libs/libvpx/tools.mk new file mode 100644 index 0000000000..1d005b2acf --- /dev/null +++ b/libs/libvpx/tools.mk @@ -0,0 +1,115 @@ +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# List of tools to build. +TOOLS-yes += tiny_ssim.c +tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \ + vpx/vpx_codec.h vpx/src/vpx_image.c +tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h +tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h +tiny_ssim.SRCS += vpx_ports/mem.h vpx_ports/mem.h +tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h +tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 +tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files + +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Expand list of selected tools to build (as specified above) +TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) +ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) +CFLAGS += -I../include + +ifneq ($(CONFIG_CODEC_SRCS), yes) + CFLAGS += -I../include/vpx +endif + +# Expand all tools sources into a variable containing all sources +# for that tools (not just them main one specified in TOOLS) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk)) + + +# Create build/install dependencies for all tools. 
The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) + +# Instantiate linker template for all tools. +$(foreach bin,$(BINS-yes),\ + $(eval $(bin):)\ + $(eval $(call linker_template,$(bin),\ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) -lm))) + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in TOOLS. This has the unfortunate side effect that +# touching the source files trigger a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this. +define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ +endef +TOOLS_BASENAME := $(notdir $(TOOLS)) +PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(> $@ + @echo "*/" >> $@ + +tools.dox: tools.mk + @echo " [DOXY] $@" + @echo "/*!\page tools Tools" > $@ + @echo " This SDK includes a number of tools/utilities."\ + "The following tools are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\ + echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox) +DOCS-yes += tools.doxy tools.dox +tools.doxy: tools.dox $(TOOLS:.c=.dox) + @echo "INPUT += $^" > $@ diff --git a/libs/libvpx/tools/all_builds.py b/libs/libvpx/tools/all_builds.py deleted file mode 100755 index d1f0c80c03..0000000000 --- a/libs/libvpx/tools/all_builds.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import getopt -import subprocess -import sys - -LONG_OPTIONS = ["shard=", "shards="] -BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental" - -def RunCommand(command): - run = subprocess.Popen(command, shell=True) - output = run.communicate() - if run.returncode: - print "Non-zero return code: " + str(run.returncode) + " => exiting!" 
- sys.exit(1) - -def list_of_experiments(): - experiments = [] - configure_file = open("configure") - list_start = False - for line in configure_file.read().split("\n"): - if line == 'EXPERIMENT_LIST="': - list_start = True - elif line == '"': - list_start = False - elif list_start: - currently_broken = ["csm"] - experiment = line[4:] - if experiment not in currently_broken: - experiments.append(experiment) - return experiments - -def main(argv): - # Parse arguments - options = {"--shard": 0, "--shards": 1} - if "--" in argv: - opt_end_index = argv.index("--") - else: - opt_end_index = len(argv) - try: - o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS) - except getopt.GetoptError, err: - print str(err) - print "Usage: %s [--shard= --shards=] -- [configure flag ...]"%argv[0] - sys.exit(2) - - options.update(o) - extra_args = argv[opt_end_index + 1:] - - # Shard experiment list - shard = int(options["--shard"]) - shards = int(options["--shards"]) - experiments = list_of_experiments() - base_command = " ".join([BASE_COMMAND] + extra_args) - configs = [base_command] - configs += ["%s --enable-%s" % (base_command, e) for e in experiments] - my_configs = zip(configs, range(len(configs))) - my_configs = filter(lambda x: x[1] % shards == shard, my_configs) - my_configs = [e[0] for e in my_configs] - - # Run configs for this shard - for config in my_configs: - test_build(config) - -def test_build(configure_command): - print "\033[34m\033[47mTesting %s\033[0m" % (configure_command) - RunCommand(configure_command) - RunCommand("make clean") - RunCommand("make") - -if __name__ == "__main__": - main(sys.argv) diff --git a/libs/libvpx/tools/author_first_release.sh b/libs/libvpx/tools/author_first_release.sh deleted file mode 100755 index 7b0b797212..0000000000 --- a/libs/libvpx/tools/author_first_release.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -## -## List the release each author first contributed to. -## -## Usage: author_first_release.sh [TAGS] -## -## If the TAGS arguments are unspecified, all tags reported by `git tag` -## will be considered. -## -tags=${@:-$(git tag)} -for tag in $tags; do - git shortlog -n -e -s $tag | - cut -f2- | - awk "{print \"${tag#v}\t\"\$0}" -done | sort -k2 | uniq -f2 diff --git a/libs/libvpx/tools/ftfy.sh b/libs/libvpx/tools/ftfy.sh deleted file mode 100755 index c005918fe7..0000000000 --- a/libs/libvpx/tools/ftfy.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/sh -self="$0" -dirname_self=$(dirname "$self") - -usage() { - cat <&2 -Usage: $self [option] - -This script applies a whitespace transformation to the commit at HEAD. If no -options are given, then the modified files are left in the working tree. - -Options: - -h, --help Shows this message - -n, --dry-run Shows a diff of the changes to be made. - --amend Squashes the changes into the commit at HEAD - This option will also reformat the commit message. - --commit Creates a new commit containing only the whitespace changes - --msg-only Reformat the commit message only, ignore the patch itself. 
- -EOF - rm -f ${CLEAN_FILES} - exit 1 -} - - -log() { - echo "${self##*/}: $@" >&2 -} - - -vpx_style() { - for f; do - case "$f" in - *.h|*.c|*.cc) - clang-format -i --style=file "$f" - ;; - esac - done -} - - -apply() { - [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1" -} - - -commit() { - LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}') - if [ -z "$LAST_CHANGEID" ]; then - log "HEAD doesn't have a Change-Id, unable to generate a new commit" - exit 1 - fi - - # Build a deterministic Change-Id from the parent's - NEW_CHANGEID=${LAST_CHANGEID}-styled - NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin) - - # Commit, preserving authorship from the parent commit. - git commit -a -C HEAD > /dev/null - git commit --amend -F- << EOF -Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9} - -Change-Id: ${NEW_CHANGEID} -EOF -} - - -show_commit_msg_diff() { - if [ $DIFF_MSG_RESULT -ne 0 ]; then - log "Modified commit message:" - diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3 - fi -} - - -amend() { - show_commit_msg_diff - if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then - git commit -a --amend -F "$NEW_COMMIT_MSG" - fi -} - - -diff_msg() { - git log -1 --format=%B > "$ORIG_COMMIT_MSG" - "${dirname_self}"/wrap-commit-msg.py \ - < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG" - cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" - DIFF_MSG_RESULT=$? -} - - -# Temporary files -ORIG_DIFF=orig.diff.$$ -MODIFIED_DIFF=modified.diff.$$ -FINAL_DIFF=final.diff.$$ -ORIG_COMMIT_MSG=orig.commit-msg.$$ -NEW_COMMIT_MSG=new.commit-msg.$$ -CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}" -CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}" - -# Preconditions -[ $# -lt 2 ] || usage - -if ! clang-format -version >/dev/null 2>&1; then - log "clang-format not found" - exit 1 -fi - -if ! git diff --quiet HEAD; then - log "Working tree is dirty, commit your changes first" - exit 1 -fi - -# Need to be in the root -cd "$(git rev-parse --show-toplevel)" - -# Collect the original diff -git show > "${ORIG_DIFF}" - -# Apply the style guide on new and modified files and collect its diff -for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do - case "$f" in - third_party/*) continue;; - esac - vpx_style "$f" -done -git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}" - -# Intersect the two diffs -"${dirname_self}"/intersect-diffs.py \ - "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}" -INTERSECT_RESULT=$? -git reset --hard >/dev/null - -# Fixup the commit message -diff_msg - -# Handle options -if [ -n "$1" ]; then - case "$1" in - -h|--help) usage;; - -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;; - --commit) apply "${FINAL_DIFF}"; commit;; - --amend) apply "${FINAL_DIFF}"; amend;; - --msg-only) amend;; - *) usage;; - esac -else - apply "${FINAL_DIFF}" - if ! git diff --quiet; then - log "Formatting changes applied, verify and commit." - log "See also: http://www.webmproject.org/code/contribute/conventions/" - git diff --stat - fi -fi - -rm -f ${CLEAN_FILES} diff --git a/libs/libvpx/tools/gen_authors.sh b/libs/libvpx/tools/gen_authors.sh index 4cfd81ec3f..f163f663af 100755 --- a/libs/libvpx/tools/gen_authors.sh +++ b/libs/libvpx/tools/gen_authors.sh @@ -6,7 +6,8 @@ cat <" | sort | uniq | grep -v corp.google) +$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google \ + | grep -v noreply) Google Inc. 
The Mozilla Foundation The Xiph.Org Foundation diff --git a/libs/libvpx/tools/tiny_ssim.c b/libs/libvpx/tools/tiny_ssim.c new file mode 100644 index 0000000000..5e8ca02b49 --- /dev/null +++ b/libs/libvpx/tools/tiny_ssim.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "./y4minput.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/mem.h" + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +#if CONFIG_VP9_HIGHBITDEPTH +static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, + uint16_t *recon, int recon_stride, + unsigned int cols, unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + return total_sse; +} +#endif +static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, + int recon_stride, unsigned int cols, + unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + return total_sse; +} + +#define MAX_PSNR 100 +static double mse2psnr(double samples, double peak, double mse) { + double psnr; + + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); + else + psnr = MAX_PSNR; // Limit to prevent / 0 + + if (psnr > MAX_PSNR) psnr = MAX_PSNR; + + return psnr; +} + +typedef enum { RAW_YUV, Y4M } input_file_type; + +typedef struct input_file { + FILE *file; + input_file_type type; + unsigned char *buf; + y4m_input y4m; + vpx_image_t img; + int w; + int h; + int bit_depth; +} input_file_t; + +// Open a file and determine if its y4m or raw. If y4m get the header. +static int open_input_file(const char *file_name, input_file_t *input, int w, + int h, int bit_depth) { + char y4m_buf[4]; + size_t r1; + input->type = RAW_YUV; + input->buf = NULL; + input->file = strcmp(file_name, "-") ? fopen(file_name, "rb") : stdin; + if (input->file == NULL) return -1; + r1 = fread(y4m_buf, 1, 4, input->file); + if (r1 == 4) { + if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; + switch (input->type) { + case Y4M: + y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); + input->w = input->y4m.pic_w; + input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; + // Y4M alloc's its own buf. Init this to avoid problems if we never + // read frames. 
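Editor's aside on mse2psnr() defined earlier in tiny_ssim.c: its third argument is the total sum of squared errors over the plane, not the per-sample MSE, so peak*peak*samples/sse reduces to peak^2/MSE. A worked sketch (hypothetical numbers, names are mine):

#include <math.h>
#include <stdio.h>
/* Sketch of the same computation, capped at 100 dB as in mse2psnr(). */
static double psnr_sketch(double samples, double peak, double sse) {
  double psnr = (sse > 0.0) ? 10.0 * log10(peak * peak * samples / sse) : 100.0;
  return psnr > 100.0 ? 100.0 : psnr;
}

int main(void) {
  /* A 1920x1080 8-bit luma plane with an average squared error of 1
   * per sample gives 10*log10(255^2), roughly 48.13 dB. */
  const double samples = 1920.0 * 1080.0;
  printf("%.2f dB\n", psnr_sketch(samples, 255.0, samples * 1.0));
  return 0;
}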
+ memset(&input->img, 0, sizeof(input->img)); + break; + case RAW_YUV: + fseek(input->file, 0, SEEK_SET); + input->w = w; + input->h = h; + if (bit_depth < 9) + input->buf = malloc(w * h * 3 / 2); + else + input->buf = malloc(w * h * 3); + break; + } + } + return 0; +} + +static void close_input_file(input_file_t *in) { + if (in->file) fclose(in->file); + if (in->type == Y4M) { + vpx_img_free(&in->img); + } else { + free(in->buf); + } +} + +static size_t read_input_file(input_file_t *in, unsigned char **y, + unsigned char **u, unsigned char **v, int bd) { + size_t r1 = 0; + switch (in->type) { + case Y4M: + r1 = y4m_input_fetch_frame(&in->y4m, in->file, &in->img); + *y = in->img.planes[0]; + *u = in->img.planes[1]; + *v = in->img.planes[2]; + break; + case RAW_YUV: + if (bd < 9) { + r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h; + *v = in->buf + 5 * in->w * in->h / 4; + } else { + r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h / 2; + *v = *u + in->w * in->h / 2; + } + break; + } + + return r1; +} + +void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + int64_t ssim_n, ssim_d; + int64_t c1 = 0, c2 = 0; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, 
const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. 
+// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. 
+ // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. 
/ (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height, + in_bd, shift); + + b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +int main(int argc, char *argv[]) { + FILE *framestats = NULL; + int bit_depth = 8; + int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0; + double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0; + double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0; + double psnravg = 0, psnryavg = 0, psnruavg = 0, psnrvavg = 0; + double *ssimy = NULL, *ssimu = NULL, *ssimv = NULL; + uint64_t *psnry = NULL, *psnru = NULL, *psnrv = NULL; + size_t i, n_frames = 0, allocated_frames = 0; + int return_value = 0; + input_file_t in[2]; + double peak = 255.0; + + if (argc < 2) { + fprintf(stderr, + "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" + "[WxH tl_skip={0,1,3} frame_stats_file bits]\n", + argv[0]); + return_value = 1; + goto clean_up; + } + + if (argc > 3) { + sscanf(argv[3], "%dx%d", &w, &h); + } + + if (argc > 6) { + sscanf(argv[6], "%d", &bit_depth); + } + + if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + goto clean_up; + } + + if (w == 0 && h == 0) { + // If a y4m is the first file and w, h is not set grab from first file. + w = in[0].w; + h = in[0].h; + bit_depth = in[0].bit_depth; + } + if (bit_depth == 10) peak = 1023.0; + + if (bit_depth == 12) peak = 4095; + + if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + goto clean_up; + } + + if (in[0].w != in[1].w || in[0].h != in[1].h || in[0].w != w || + in[0].h != h || w == 0 || h == 0) { + fprintf(stderr, + "Failing: Image dimensions don't match or are unspecified!\n"); + return_value = 1; + goto clean_up; + } + + // Number of frames to skip from file1.yuv for every frame used. Normal values + // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding + // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. 
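Editor's aside: the tl_skip handling that follows keeps one frame of file1 and then discards the next tl_skip frames, so frames 0, tl_skip+1, 2*(tl_skip+1), ... are the ones compared against file2. A standalone sketch of that selection (illustrative only; names are mine):

#include <stdio.h>
/* Sketch: which source-frame indices survive a given tl_skip. */
int main(void) {
  const int tl_skip = 3;  /* e.g. comparing TL0 of a 3-temporal-layer stream */
  int frame, remaining = 0;
  for (frame = 0; frame < 12; ++frame) {
    if (remaining > 0) { --remaining; continue; }  /* skipped frame */
    remaining = tl_skip;
    printf("compare frame %d\n", frame);           /* prints 0, 4, 8 */
  }
  return 0;
}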
+ if (argc > 4) { + sscanf(argv[4], "%d", &tl_skip); + if (argc > 5) { + framestats = fopen(argv[5], "w"); + if (!framestats) { + fprintf(stderr, "Could not open \"%s\" for writing: %s\n", argv[5], + strerror(errno)); + return_value = 1; + goto clean_up; + } + } + } + + if (w & 1 || h & 1) { + fprintf(stderr, "Invalid size %dx%d\n", w, h); + return_value = 1; + goto clean_up; + } + + while (1) { + size_t r1, r2; + unsigned char *y[2], *u[2], *v[2]; + + r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth); + + if (r1) { + // Reading parts of file1.yuv that were not used in temporal layer. + if (tl_skips_remaining > 0) { + --tl_skips_remaining; + continue; + } + // Use frame, but skip |tl_skip| after it. + tl_skips_remaining = tl_skip; + } + + r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth); + + if (r1 && r2 && r1 != r2) { + fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), + (int)r1, (int)r2); + return_value = 1; + goto clean_up; + } else if (r1 == 0 || r2 == 0) { + break; + } +#if CONFIG_VP9_HIGHBITDEPTH +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ + w, w, h, bit_depth, bit_depth - 8); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } +#else +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); +#endif + + if (n_frames == allocated_frames) { + allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2; + ssimy = realloc(ssimy, allocated_frames * sizeof(*ssimy)); + ssimu = realloc(ssimu, allocated_frames * sizeof(*ssimu)); + ssimv = realloc(ssimv, allocated_frames * sizeof(*ssimv)); + psnry = realloc(psnry, allocated_frames * sizeof(*psnry)); + psnru = realloc(psnru, allocated_frames * sizeof(*psnru)); + psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); + } + psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); + psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], w / 2, h / 2); + psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], w / 2, h / 2); + + n_frames++; + } + + if (framestats) { + fprintf(framestats, + "ssim,ssim-y,ssim-u,ssim-v,psnr,psnr-y,psnr-u,psnr-v\n"); + } + + for (i = 0; i < n_frames; ++i) { + double frame_ssim; + double frame_psnr, frame_psnry, frame_psnru, frame_psnrv; + + frame_ssim = 0.8 * ssimy[i] + 0.1 * (ssimu[i] + ssimv[i]); + ssimavg += frame_ssim; + ssimyavg += ssimy[i]; + ssimuavg += ssimu[i]; + ssimvavg += ssimv[i]; + + frame_psnr = + mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]); + frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]); + frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]); + frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]); + + psnravg += frame_psnr; + psnryavg += frame_psnry; + psnruavg += frame_psnru; + psnrvavg += frame_psnrv; + + psnryglb += psnry[i]; + psnruglb += psnru[i]; + psnrvglb += psnrv[i]; + + if (framestats) { + fprintf(framestats, "%lf,%lf,%lf,%lf,%lf,%lf,%lf,%lf\n", frame_ssim, + ssimy[i], ssimu[i], ssimv[i], frame_psnr, frame_psnry, + frame_psnru, frame_psnrv); + } + } + + ssimavg /= n_frames; + ssimyavg /= n_frames; + ssimuavg /= n_frames; + ssimvavg /= n_frames; + + printf("VpxSSIM: %lf\n", 100 * pow(ssimavg, 8.0)); + 
printf("SSIM: %lf\n", ssimavg); + printf("SSIM-Y: %lf\n", ssimyavg); + printf("SSIM-U: %lf\n", ssimuavg); + printf("SSIM-V: %lf\n", ssimvavg); + puts(""); + + psnravg /= n_frames; + psnryavg /= n_frames; + psnruavg /= n_frames; + psnrvavg /= n_frames; + + printf("AvgPSNR: %lf\n", psnravg); + printf("AvgPSNR-Y: %lf\n", psnryavg); + printf("AvgPSNR-U: %lf\n", psnruavg); + printf("AvgPSNR-V: %lf\n", psnrvavg); + puts(""); + + psnrglb = psnryglb + psnruglb + psnrvglb; + psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb); + psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb); + psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb); + psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb); + + printf("GlbPSNR: %lf\n", psnrglb); + printf("GlbPSNR-Y: %lf\n", psnryglb); + printf("GlbPSNR-U: %lf\n", psnruglb); + printf("GlbPSNR-V: %lf\n", psnrvglb); + puts(""); + + printf("Nframes: %d\n", (int)n_frames); + +clean_up: + + close_input_file(&in[0]); + close_input_file(&in[1]); + + if (framestats) fclose(framestats); + + free(ssimy); + free(ssimu); + free(ssimv); + + free(psnry); + free(psnru); + free(psnrv); + + return return_value; +} diff --git a/libs/libvpx/tools_common.h b/libs/libvpx/tools_common.h index 73ba1bc03b..e41de3195f 100644 --- a/libs/libvpx/tools_common.h +++ b/libs/libvpx/tools_common.h @@ -26,11 +26,21 @@ /* MSVS uses _f{seek,tell}i64. */ #define fseeko _fseeki64 #define ftello _ftelli64 +typedef int64_t FileOffset; #elif defined(_WIN32) /* MinGW uses f{seek,tell}o64 for large files. */ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) @@ -42,13 +52,6 @@ #endif /* _MSC_VER */ #endif /* CONFIG_OS_SUPPORT */ -/* Use 32-bit file operations in WebM file format when building ARM - * executables (.axf) with RVCT. 
*/ -#if !CONFIG_OS_SUPPORT -#define fseeko fseek -#define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ - #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) #ifndef PATH_MAX diff --git a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c index af566c2c41..8520ab5ca0 100644 --- a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -11,6 +11,7 @@ #include #include #include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, @@ -21,35 +22,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); } -static INLINE void store4x4(unsigned char *dst, int dst_stride, - const uint8x8_t a0, const uint8x8_t a1) { - if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1); - } else { - // Store to the aligned local buffer and memcpy instead of vget_lane_u8 - // which is really really slow. - uint32_t output_buffer[4]; - vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0); - vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1); - vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0); - vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1); - - memcpy(dst, output_buffer, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 1, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 2, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 3, 4); - } -} - void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -122,7 +94,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, // secondpass_filter if (yoffset == 0) { // skip_2ndpass_filter - store4x4(dst_ptr, dst_pitch, e0, e1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); } else { uint8x8_t f0, f1; const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]); @@ -140,7 +112,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, f0 = vqrshrn_n_u16(b0, 7); f1 = vqrshrn_n_u16(b1, 7); - store4x4(dst_ptr, dst_pitch, f0, f1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1)); } } diff --git a/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c index d3a53db21f..d12c3a8392 100644 --- a/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_dc_only_idct_add_neon(int16_t input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { diff --git a/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c index ff5981eaac..5445f2965a 100644 --- a/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -10,8 +10,14 @@ #include +#include "./vp8_rtcd.h" + static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets 
converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, int stride) { @@ -60,10 +66,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); @@ -90,10 +94,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); diff --git a/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c index a36c0c1cac..2724ca236b 100644 --- a/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -10,8 +10,14 @@ #include +#include "./vp8_rtcd.h" + static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, @@ -40,10 +46,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 @@ -71,10 +75,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 diff --git a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index 622baa3c50..aa2567df79 100644 --- a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -9,6 +9,9 @@ */ #include +#include +#include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { @@ -22,6 +25,367 @@ static const int8_t vp8_sub_pel_filters[8][8] = { { 0, -1, 12, 123, -6, 0, 0, 0 }, }; +// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters. +// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive. +// Elements 1 and 4 are either 0 or negative. 
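For reference, a minimal plain-C sketch of the 6-tap filtering these routines implement with the signed taps from vp8_sub_pel_filters; the helper name is illustrative only and not part of this change:

static unsigned char sixtap_pixel(const unsigned char *src, int pitch,
                                  const int8_t taps[6]) {
  int k, sum = 0;
  // The six taps cover src[-2 * pitch] .. src[3 * pitch].
  for (k = 0; k < 6; ++k) sum += taps[k] * src[(k - 2) * pitch];
  sum = (sum + 64) >> 7;  // same rounding as the vqrshrun_n_s16(..., 7) calls below
  return (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}
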
The code accounts for this with +// multiply/accumulates which either add or subtract as needed. The other +// functions will be updated to use this table later. +// It is also expanded to 8 elements to allow loading into 64 bit neon +// registers. +static const uint8_t abs_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 }, + { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 }, + { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 }, + { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 }, +}; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void yonly4x4(const unsigned char *src, int src_stride, + int filter_offset, unsigned char *dst, + int dst_stride) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + + const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]); + const uint8x8_t filter0 = vdup_lane_u8(filter, 0); + const uint8x8_t filter1 = vdup_lane_u8(filter, 1); + const uint8x8_t filter2 = vdup_lane_u8(filter, 2); + const uint8x8_t filter3 = vdup_lane_u8(filter, 3); + const uint8x8_t filter4 = vdup_lane_u8(filter, 4); + const uint8x8_t filter5 = vdup_lane_u8(filter, 5); + + src -= src_stride * 2; + // Shift the even rows to allow using 'vext' to combine the vectors. armv8 + // has vcopy_lane which would be interesting. This started as just a + // horrible workaround for clang adding alignment hints to 32bit loads: + // https://llvm.org/bugs/show_bug.cgi?id=24421 + // But it turns out it almost identical to casting the loads. + a0 = load_and_shift(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = load_and_shift(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + a4 = load_and_shift(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = load_and_shift(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + a8 = vld1_u8(src); + + // Combine the rows so we can operate on 8 at a time. + b0 = vext_u8(a0, a1, 4); + b2 = vext_u8(a2, a3, 4); + b4 = vext_u8(a4, a5, 4); + b6 = vext_u8(a6, a7, 4); + b8 = a8; + + // To keep with the 8-at-a-time theme, combine *alternate* rows. This + // allows combining the odd rows with the even. 
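  // At this point each 8-lane register holds two 4-pixel rows: b0 is the two
  // rows above the block, b2 rows 0-1, b4 rows 2-3, b6 rows 4-5, and b8 row 6.
  // The vext operations below build b1/b3/b5/b7, the same data offset by one
  // row, so each multiply-accumulate applies one tap to the inputs of two
  // output rows at once.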
+ b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + // Multiply and expand to 16 bits. + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + // Multiply, subtract and accumulate for filters 1 and 4 (the negative + // ones). + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + // Add more positive ones. vmlal should really return a signed type. + // It's doing signed math internally, as evidenced by the fact we can do + // subtractions followed by more additions. Ideally we could use + // vqmlal/sl but that instruction doesn't exist. Might be able to + // shoehorn vqdmlal/vqdmlsl in here but it would take some effort. + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + // Use signed saturation math because vmlsl may have left some negative + // numbers in there. + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + // Use signed again because numbers like -200 need to be saturated to 0. + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1)); +} + +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + uint8x16_t s0, s1, s2, s3, s4; + uint64x2_t s01, s23; + // Variables to hold src[] elements for the given filter[] + uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5; + uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4; + uint8x16_t s01_f0, s23_f0; + uint64x2_t s01_f3, s23_f3; + uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q; + // Accumulator variables. + uint16x8_t d0123, d4567, d89; + uint16x8_t d0123_a, d4567_a, d89_a; + int16x8_t e0123, e4567, e89; + // Second pass intermediates. + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5; + + if (xoffset == 0) { // Second pass only. + yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch); + return; + } + + if (yoffset == 0) { // First pass only. + src_ptr -= 2; + } else { // Add context for the second pass. 2 extra lines on top. + src_ptr -= 2 + (src_pixels_per_line * 2); + } + + filter = vld1_u8(abs_filters[xoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of + // garbage. So much effort for that last single bit. + // The low values of each pair are for filter0. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + // Shift to extract values for filter[5] + // If src[] is 0, this puts: + // 3 4 5 6 7 8 9 10 in s0_f5 + // Can't use vshr.u64 because it crosses the double word boundary. 
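  // Only the low four lanes of each s*_f5 value are used: output pixel x takes
  // its filter[5] tap from source pixel x + 3, i.e. pixels 3..6 of the row.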
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + + // Keep original src data as 64 bits to simplify shifting and extracting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + // 3 4 5 6 * filter0 + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + + // Shift over one to use -1, 0, 1, 2 for filter1 + // -1 0 1 2 * filter1 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + + // 2 3 4 5 * filter4 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + + // 0 1 2 3 * filter2 + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + + // 1 2 3 4 * filter3 + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + // Accumulate into different registers so it can use saturated addition. + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + + // Shift and narrow. + b0 = vqrshrun_n_s16(e0123, 7); + b2 = vqrshrun_n_s16(e4567, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2)); + return; + } + + // Load additional context when doing both filters. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s4 = vld1q_u8(src_ptr); + + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5); + + // 3 4 5 6 * filter0 + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + // But this time instead of 16 pixels to filter, there are 20. So an extra + // run with a doubleword register. 
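  // These five extra rows, plus the four already filtered above, give the nine
  // horizontally filtered rows (two above, four in the block, three below)
  // that the vertical 6-tap pass needs; the odd fifth row is accumulated in
  // d89.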
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + d89 = vmull_u8(s4_f5, filter5); + + // Save a copy as u64 for shifting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + d89 = vmlal_u8(d89, vget_low_u8(s4), filter0); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1); + d89 = vmlsl_u8(d89, s4_f1, filter1); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4); + d89 = vmlsl_u8(d89, s4_f4, filter4); + + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2); + d89 = vmlal_u8(d89, s4_f2, filter2); + + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3); + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + d89_a = vmull_u8(s4_f3, filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a)); + + b4 = vqrshrun_n_s16(e0123, 7); + b6 = vqrshrun_n_s16(e4567, 7); + b8 = vqrshrun_n_s16(e89, 7); + + // Second pass: 4x4 + filter = vld1_u8(abs_filters[yoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); +} + void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch) { diff --git a/libs/libvpx/vp8/common/blockd.h b/libs/libvpx/vp8/common/blockd.h index 74fc5d6dbf..1a3aad16af 100644 --- a/libs/libvpx/vp8/common/blockd.h +++ b/libs/libvpx/vp8/common/blockd.h @@ -169,6 +169,11 @@ typedef struct { typedef struct { FRAME_TYPE frame_type; int is_frame_dropped; + // If frame is dropped due to overshoot 
after encode_frame. This triggers a + // drop and resets rate control with Q forced to max for following frame. + // The check for this dropping due to overshoot is only done on lowest stream, + // and if set will force drop on all spatial streams for that current frame. + int is_frame_dropped_overshoot_maxqp; // The frame rate for the lowest resolution. double low_res_framerate; /* The frame number of each reference frames */ diff --git a/libs/libvpx/vp8/common/entropymode.c b/libs/libvpx/vp8/common/entropymode.c index 30c2fa86ae..239492a8cb 100644 --- a/libs/libvpx/vp8/common/entropymode.c +++ b/libs/libvpx/vp8/common/entropymode.c @@ -34,12 +34,13 @@ int vp8_mv_cont(const int_mv *l, const int_mv *a) { static const vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1] = { 180, 162, 25 }; -const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT] - [VP8_SUBMVREFS - 1] = { { 147, 136, 18 }, - { 106, 145, 1 }, - { 179, 121, 1 }, - { 223, 1, 34 }, - { 208, 1, 1 } }; +const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1] = { + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1, 34 }, + { 208, 1, 1 } +}; const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS] = { { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, diff --git a/libs/libvpx/vp8/common/entropymode.h b/libs/libvpx/vp8/common/entropymode.h index e0a17df10c..b3fad19be0 100644 --- a/libs/libvpx/vp8/common/entropymode.h +++ b/libs/libvpx/vp8/common/entropymode.h @@ -78,8 +78,8 @@ extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1]; void vp8_init_mbmode_probs(VP8_COMMON *x); void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]); -void vp8_kf_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES] - [VP8_BINTRAMODES - 1]); +void vp8_kf_default_bmode_probs( + vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1]); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp8/common/generic/systemdependent.c b/libs/libvpx/vp8/common/generic/systemdependent.c index 89abd41c09..0432ee9887 100644 --- a/libs/libvpx/vp8/common/generic/systemdependent.c +++ b/libs/libvpx/vp8/common/generic/systemdependent.c @@ -14,6 +14,8 @@ #include "vpx_ports/arm.h" #elif ARCH_X86 || ARCH_X86_64 #include "vpx_ports/x86.h" +#elif ARCH_PPC +#include "vpx_ports/ppc.h" #endif #include "vp8/common/onyxc_int.h" #include "vp8/common/systemdependent.h" @@ -83,8 +85,6 @@ static int get_cpu_count() { } #endif -void vp8_clear_system_state_c(){}; - void vp8_machine_specific_config(VP8_COMMON *ctx) { #if CONFIG_MULTITHREAD ctx->processor_core_count = get_cpu_count(); @@ -96,5 +96,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { ctx->cpu_caps = arm_cpu_caps(); #elif ARCH_X86 || ARCH_X86_64 ctx->cpu_caps = x86_simd_caps(); +#elif ARCH_PPC + ctx->cpu_caps = ppc_simd_caps(); #endif } diff --git a/libs/libvpx/vp8/common/loopfilter_filters.c b/libs/libvpx/vp8/common/loopfilter_filters.c index 1f60721e1c..188e290ca7 100644 --- a/libs/libvpx/vp8/common/loopfilter_filters.c +++ b/libs/libvpx/vp8/common/loopfilter_filters.c @@ -63,8 +63,8 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, filter_value &= mask; /* save bottom 3 bits so that we round one side +4 and the other +3 - * if it equals 4 we'll set to adjust by -1 to account for the fact - * we'd round 3 the other way + * if it equals 4 we'll set it to adjust by -1 to account for the fact + * we'd round it by 3 the other way */ Filter1 = vp8_signed_char_clamp(filter_value + 4); Filter2 = vp8_signed_char_clamp(filter_value + 3); @@ -86,10 
+86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, u = vp8_signed_char_clamp(ps1 + filter_value); *op1 = u ^ 0x80; } -void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { + +static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ } while (++i < count * 8); } -void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void loop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, *op2 = s ^ 0x80; } -void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { +static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, } while (++i < count * 8); } -void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void mbloop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -295,17 +299,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -313,17 +317,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_vertical_edge_c(u_ptr, 
uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -331,21 +335,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } } @@ -363,21 +367,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } } diff --git a/libs/libvpx/vp8/common/mfqe.c b/libs/libvpx/vp8/common/mfqe.c index 5aace8c99d..b6f8146b84 100644 --- a/libs/libvpx/vp8/common/mfqe.c +++ b/libs/libvpx/vp8/common/mfqe.c @@ -74,8 +74,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride, src_weight); vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); - } else /* if (block_size == 8) */ - { + } else { vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, @@ -136,8 +135,7 @@ static void multiframe_quality_enhance_block( usad = 
(vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6; #endif - } else /* if (blksize == 8) */ - { + } else { actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; #ifdef USE_SSD @@ -186,14 +184,12 @@ static void multiframe_quality_enhance_block( apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, uvd_stride, blksize, ifactor); } - } else /* else implicitly copy from previous frame */ - { + } else { /* else implicitly copy from previous frame */ if (blksize == 16) { vp8_copy_mem16x16(y, y_stride, yd, yd_stride); vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); - } else /* if (blksize == 8) */ - { + } else { vp8_copy_mem8x8(y, y_stride, yd, yd_stride); for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) { @@ -297,8 +293,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { } } } - } else /* totmap = 4 */ - { + } else { /* totmap = 4 */ multiframe_quality_enhance_block( 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride, show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride, diff --git a/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 7612024b7d..e46827b0e4 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2), + [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr)); @@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2), - [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1), + [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1), [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b)); @@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), + [Temp4] "=r"(Temp4), [tp1] "+r"(tp1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4), + [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); @@ -1469,6 +1469,7 @@ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr, unsigned char src_ptr_r2; unsigned char src_ptr_r3; unsigned char *cm = ff_cropTbl 
+ CROP_WIDTH; + (void)output_width; vector4a = 64; diff --git a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index b79af1cc88..d2c3442515 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -306,6 +306,7 @@ void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -498,6 +499,7 @@ void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -918,6 +920,7 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; /* loop filter designed to work using chars so that we can make maximum use * of 8 bit simd instructions. @@ -1612,6 +1615,7 @@ void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -1915,6 +1919,7 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; mask = 0; hev = 0; diff --git a/libs/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/libs/libvpx/vp8/common/mips/mmi/copymem_mmi.c new file mode 100644 index 0000000000..86a32aa9ef --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/copymem_mmi.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define COPY_MEM_16X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + "ldl %[tmp0], 0x0f(%[src]) \n\t" \ + "ldr %[tmp0], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + "sdl %[tmp0], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp0], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "ldl %[tmp1], 0x0f(%[src]) \n\t" \ + "ldr %[tmp1], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \ + "sdl %[tmp1], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp1], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +#define COPY_MEM_8X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "ldl %[tmp0], 0x07(%[src]) \n\t" \ + "ldr %[tmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "sdl %[tmp0], 0x07(%[dst]) \n\t" \ + "sdr %[tmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + double ftmp[2]; + uint64_t tmp[2]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_16X2 + COPY_MEM_16X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_8X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + COPY_MEM_8X2 + COPY_MEM_8X2 + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} diff --git a/libs/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/libs/libvpx/vp8/common/mips/mmi/dequantize_mmi.c new file mode 100644 index 0000000000..b3f8084aee --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/dequantize_mmi.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC) + : "memory"); +} + +void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, + int stride) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] 
"=&f"(ftmp[7]) + : [dq] "r"(dq), [input] "r"(input) + : "memory"); + + vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "sdl $0, 0x0f(%[input]) \n\t" + "sdr $0, 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t" + "sdl $0, 0x1f(%[input]) \n\t" + "sdr $0, 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]) + : [input] "r"(input) + : "memory"); +} diff --git a/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 0000000000..f6020ab468 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, int8_t *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, + uint8_t *dstv, int stride, + int8_t *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4 * stride - 8; + } +} diff --git a/libs/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/libs/libvpx/vp8/common/mips/mmi/idctllm_mmi.c new file mode 100644 index 0000000000..5e48f59166 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/idctllm_mmi.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define TRANSPOSE_4H \ + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint32_t tmp[0]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 
7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 
0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr) + : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3), + [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a1 = ((input_dc + 4) >> 3); + double ftmp[5]; + int low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ftmp[12]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x03) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" 
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]) + : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/libs/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/libs/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 0000000000..f2182f95c9 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1337 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_003f) = { 0x003f003f003f003fULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_0900) = { 0x0900090009000900ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1200) = { 0x1200120012001200ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1b00) = { 0x1b001b001b001b00ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL }; + +void vp8_loop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[12]; + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "gsldlc1 %[ftmp10], 
0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" + "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" + "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + 
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_03]"f"(ff_pb_03) + : "memory" + ); +} + +void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], 
%[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "xor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "psrah 
%[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], 
%[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] 
\n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), 
[ff_pb_03]"f"(ff_pb_03), + [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00), + [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f) + : "memory" + ); +} + +#define VP8_MBLOOP_VPSRAB_ADDH \ + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[14]; + + __asm__ volatile ( + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] 
\n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] 
\n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 
0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900), + [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], 
%[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "and %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], 
%[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "xor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "xor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "dsrl %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], 
%[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + 
vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/libs/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/libs/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 0000000000..77d665d456 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 
0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], 
%[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + uint32_t tmp[1]; + mips_reg addr[1]; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been 
rearranged. + */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) 
\n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 
16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/libs/libvpx/vp8/common/mips/msa/idct_msa.c b/libs/libvpx/vp8/common/mips/msa/idct_msa.c index e1759c875e..3d516d0f81 100644 --- a/libs/libvpx/vp8/common/mips/msa/idct_msa.c +++ b/libs/libvpx/vp8/common/mips/msa/idct_msa.c @@ -90,8 +90,7 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; v16i8 zero = { 0 }; - v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3; - v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + v16i8 pred0, pred1, pred2, pred3; LD_SH2(input, 8, input0, input1); UNPCK_SH_SW(input0, in0, in1); @@ -111,20 +110,17 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, res1 = CLIP_SW_0_255(res1); res2 = CLIP_SW_0_255(res2); res3 = CLIP_SW_0_255(res3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); } static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, int32_t dest_stride) { - v8i16 vec; - v8i16 res0, res1, res2, res3; + v8i16 vec, res0, res1, res2, res3, dst0, dst1; v16i8 zero = { 0 }; - v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3; - v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + v16i8 pred0, pred1, pred2, pred3; vec = __msa_fill_h(in_dc); vec = __msa_srari_h(vec, 3); @@ -133,55 +129,59 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, res2, res3); ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1); + dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0); + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); } void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { - v8i16 input0, input1; - v4i32 in0, in1, in2, in3, a1, b1, c1, d1; - v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v8i16 input0, input1, tmp0, tmp1, tmp2, 
tmp3, out0, out1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; LD_SH2(input, 8, input0, input1); - UNPCK_SH_SW(input0, in0, in1); - UNPCK_SH_SW(input1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1); - BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2); - TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); - BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1); - BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2); - ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3); - SRA_4V(vt0, vt1, vt2, vt3, 3); - mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0); - mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0); - mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0); - mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0); - mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2); - mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2); - mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2); - mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2); - mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4); - mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4); - mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4); - mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4); - mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6); - mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6); - mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6); - mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6); + input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADD2(tmp0, 3, tmp1, 3, out0, out1); + out0 >>= 3; + out1 >>= 3; + mb_dq_coeff[0] = __msa_copy_s_h(out0, 0); + mb_dq_coeff[16] = __msa_copy_s_h(out0, 4); + mb_dq_coeff[32] = __msa_copy_s_h(out1, 0); + mb_dq_coeff[48] = __msa_copy_s_h(out1, 4); + mb_dq_coeff[64] = __msa_copy_s_h(out0, 1); + mb_dq_coeff[80] = __msa_copy_s_h(out0, 5); + mb_dq_coeff[96] = __msa_copy_s_h(out1, 1); + mb_dq_coeff[112] = __msa_copy_s_h(out1, 5); + mb_dq_coeff[128] = __msa_copy_s_h(out0, 2); + mb_dq_coeff[144] = __msa_copy_s_h(out0, 6); + mb_dq_coeff[160] = __msa_copy_s_h(out1, 2); + mb_dq_coeff[176] = __msa_copy_s_h(out1, 6); + mb_dq_coeff[192] = __msa_copy_s_h(out0, 3); + mb_dq_coeff[208] = __msa_copy_s_h(out0, 7); + mb_dq_coeff[224] = __msa_copy_s_h(out1, 3); + mb_dq_coeff[240] = __msa_copy_s_h(out1, 7); } static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1; - v8i16 in0, in1, in2, in3; - v8i16 hz0_h, hz1_h, hz2_h, hz3_h; - v16i8 dest0, dest1, dest2, dest3; - v4i32 hz0_w, hz1_w, hz2_w, hz3_w; - v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3; + v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h; + v16u8 dest0, dest1, dest2, dest3; + v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3; v2i64 zero = { 0 }; - v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; LD_SH2(input, 8, input0, input1); LD_SH2(dequant_input, 8, dequant_in0, dequant_in1); @@ -196,7 +196,7 @@ static void dequant_idct4x4_addblk_msa(int16_t 
*input, int16_t *dequant_input, VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3); SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); + LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3); ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, res2, res3); ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, @@ -206,19 +206,17 @@ static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, res1 = CLIP_SW_0_255(res1); res2 = CLIP_SW_0_255(res2); res3 = CLIP_SW_0_255(res3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); } static void dequant_idct4x4_addblk_2x_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { v16u8 dest0, dest1, dest2, dest3; - v8i16 in0, in1, in2, in3; - v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; - v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; - v8i16 res0, res1, res2, res3; + v8i16 in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; v16i8 zero = { 0 }; @@ -247,11 +245,8 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input, res2, res3); ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2, - res3); - PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1); - PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3); - ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l); + ST8x4_UB(vt0l, vt1l, dest, dest_stride); __asm__ __volatile__( "sw $zero, 0(%[input]) \n\t" @@ -276,10 +271,9 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input, static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { - v8i16 input_dc0, input_dc1, vec; + v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3; v16u8 dest0, dest1, dest2, dest3; v16i8 zero = { 0 }; - v8i16 res0, res1, res2, res3; input_dc0 = __msa_fill_h(input[0] * dequant_input[0]); input_dc1 = __msa_fill_h(input[16] * dequant_input[0]); @@ -292,11 +286,8 @@ static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, res2, res3); ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2, - res3); - PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1); - PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3); - ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SH(res1, res0, res3, res2, res0, res1); + ST8x4_UB(res0, res1, dest, dest_stride); } void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr, diff --git a/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c index f5f1790ef5..98a4fc09a3 100644 --- 
a/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c +++ b/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c @@ -24,208 +24,145 @@ mask = ((v16u8)mask <= b_limit); \ } -#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \ - mask_in, hev_in) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = 
(v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_r *= cnst3h; \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_l *= cnst3h; \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)(mask); \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ +#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \ + v16i8 q0_sub_p0; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= cnst3b; \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ } -#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ - { \ - v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ - v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \ - v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \ - v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \ - \ - cnst3h = __msa_ldi_h(3); \ - \ - p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ - p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ - q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - q0_sub_p0 = q0_m - p0_m; \ - q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_r *= cnst3h; \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r = filt_r + q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_l *= cnst3h; \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l = filt_l + q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask; \ - filt2 = filt & (v16i8)hev; \ - \ - hev = __msa_xori_b(hev, 0xff); \ - filt = filt & (v16i8)hev; \ - 
cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt2, cnst4b); \ - filt1 >>= 3; \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt2, cnst3b); \ - filt2 >>= 3; \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - \ - filt_sign = __msa_clti_s_b(filt, 0); \ - ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ - \ - cnst27h = __msa_ldi_h(27); \ - cnst63h = __msa_ldi_h(63); \ - \ - u_r = filt_r * cnst27h; \ - u_r += cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - u_l = filt_l * cnst27h; \ - u_l += cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q0_m = __msa_subs_s_b(q0_m, u); \ - q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, u); \ - p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ - cnst18h = __msa_ldi_h(18); \ - u_r = filt_r * cnst18h; \ - u_r += cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - \ - u_l = filt_l * cnst18h; \ - u_l += cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q1_m = __msa_subs_s_b(q1_m, u); \ - q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, u); \ - p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ - u_r = filt_r << 3; \ - u_r += filt_r + cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - \ - u_l = filt_l << 3; \ - u_l += filt_l + cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q2_m = __msa_subs_s_b(q2_m, u); \ - q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ - p2_m = __msa_adds_s_b(p2_m, u); \ - p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + { \ + v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0; \ + v8i16 filt_r, filt_l, u_r, u_l; \ + v8i16 temp0, temp1, temp2, temp3; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + const v8i16 cnst9h = __msa_ldi_h(9); \ + const v8i16 cnst63h = __msa_ldi_h(63); \ + \ + p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + \ + t2 = filt & hev; \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + t1 = __msa_adds_s_b(t2, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(t2, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + filt_sign = __msa_clti_s_b(filt, 0); \ + ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ + temp0 = filt_r * cnst9h; \ + temp1 = temp0 + cnst63h; \ + temp2 = filt_l * cnst9h; \ + temp3 = temp2 + cnst63h; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q2_m = __msa_subs_s_b(q2_m, u); \ + p2_m = __msa_adds_s_b(p2_m, u); \ + q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ + p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q1_m = 
__msa_subs_s_b(q1_m, u); \ + p1_m = __msa_adds_s_b(p1_m, u); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q0_m = __msa_subs_s_b(q0_m, u); \ + p0_m = __msa_adds_s_b(p0_m, u); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index 65905f6c02..6bec3adec3 100644 --- a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -1221,6 +1221,8 @@ } #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3) \ diff --git a/libs/libvpx/vp8/common/onyx.h b/libs/libvpx/vp8/common/onyx.h index 43e3c29b50..72fba2ec56 100644 --- a/libs/libvpx/vp8/common/onyx.h +++ b/libs/libvpx/vp8/common/onyx.h @@ -110,6 +110,8 @@ typedef struct { int Sharpness; int cpu_used; unsigned int rc_max_intra_bitrate_pct; + /* percent of rate boost for golden frame in CBR mode. */ + unsigned int gf_cbr_boost_pct; unsigned int screen_content_mode; /* mode -> diff --git a/libs/libvpx/vp8/common/onyxd.h b/libs/libvpx/vp8/common/onyxd.h index e05461aad0..d3c1b0e972 100644 --- a/libs/libvpx/vp8/common/onyxd.h +++ b/libs/libvpx/vp8/common/onyxd.h @@ -22,6 +22,7 @@ extern "C" { #include "vpx/vp8.h" struct VP8D_COMP; +struct VP8Common; typedef struct { int Width; @@ -45,6 +46,7 @@ int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); +int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, enum vpx_ref_frame_type ref_frame_flag, @@ -52,6 +54,7 @@ vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); +int vp8dx_get_quantizer(const struct VP8D_COMP *c); #ifdef __cplusplus } diff --git a/libs/libvpx/vp8/common/postproc.c b/libs/libvpx/vp8/common/postproc.c index 8b8c1701a3..d67ee8a57d 100644 --- a/libs/libvpx/vp8/common/postproc.c +++ b/libs/libvpx/vp8/common/postproc.c @@ -12,6 +12,7 @@ #include "vpx_dsp_rtcd.h" #include "vp8_rtcd.h" #include "vpx_dsp/postproc.h" +#include "vpx_ports/system_state.h" #include "vpx_scale_rtcd.h" #include "vpx_scale/yv12config.h" #include "postproc.h" @@ -321,7 +322,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, } } - vp8_clear_system_state(); + vpx_clear_system_state(); if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid && oci->current_video_frame >= 2 && @@ -363,7 +364,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, oci->postproc_state.last_noise != noise_level) { double sigma; struct postproc_state *ppstate = &oci->postproc_state; - vp8_clear_system_state(); + vpx_clear_system_state(); sigma = noise_level + .5 + .6 * q / 63.0; 
ppstate->clamp = vpx_setup_noise(sigma, ppstate->generated_noise, oci->Width + 256); diff --git a/libs/libvpx/vp8/common/ppflags.h b/libs/libvpx/vp8/common/ppflags.h index 713f5dffe0..96e3af6c9c 100644 --- a/libs/libvpx/vp8/common/ppflags.h +++ b/libs/libvpx/vp8/common/ppflags.h @@ -19,14 +19,7 @@ enum { VP8D_DEBLOCK = 1 << 0, VP8D_DEMACROBLOCK = 1 << 1, VP8D_ADDNOISE = 1 << 2, - VP8D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP8D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP8D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP8D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP8D_DEBUG_DRAW_MV = 1 << 7, - VP8D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP8D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP8D_MFQE = 1 << 10 + VP8D_MFQE = 1 << 3 }; typedef struct { diff --git a/libs/libvpx/vp8/common/reconintra.c b/libs/libvpx/vp8/common/reconintra.c index 986074ec7e..8e2094da87 100644 --- a/libs/libvpx/vp8/common/reconintra.c +++ b/libs/libvpx/vp8/common/reconintra.c @@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s( unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) { MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; +#if HAVE_VSX + /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + uleft_col and vleft_col. Play it safe by reserving enough stack + space here. */ + unsigned char uleft_col[16]; + unsigned char vleft_col[16]; +#else unsigned char uleft_col[8]; unsigned char vleft_col[8]; +#endif int i; intra_pred_fn fn; diff --git a/libs/libvpx/vp8/common/reconintra4x4.c b/libs/libvpx/vp8/common/reconintra4x4.c index 07c9223331..64d33a2873 100644 --- a/libs/libvpx/vp8/common/reconintra4x4.c +++ b/libs/libvpx/vp8/common/reconintra4x4.c @@ -31,7 +31,7 @@ void vp8_init_intra4x4_predictors_internal(void) { pred[B_LD_PRED] = vpx_d45e_predictor_4x4; pred[B_RD_PRED] = vpx_d135_predictor_4x4; pred[B_VR_PRED] = vpx_d117_predictor_4x4; - pred[B_VL_PRED] = vpx_d63f_predictor_4x4; + pred[B_VL_PRED] = vpx_d63e_predictor_4x4; pred[B_HD_PRED] = vpx_d153_predictor_4x4; pred[B_HU_PRED] = vpx_d207_predictor_4x4; } @@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left) { - unsigned char Aboveb[12], *Above = Aboveb + 4; +/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + Above (aka, Aboveb + 4). Play it safe by reserving enough stack + space here. Similary for "Left". */ +#if HAVE_VSX + unsigned char Aboveb[20]; +#else + unsigned char Aboveb[12]; +#endif + unsigned char *Above = Aboveb + 4; #if HAVE_NEON // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it // over reads but does not use the extra 4 values. @@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, // indeed read, they are not used. vp8_zero_array(Left, 8); #endif // VPX_WITH_ASAN +#elif HAVE_VSX + unsigned char Left[16]; #else unsigned char Left[4]; #endif // HAVE_NEON diff --git a/libs/libvpx/vp8/common/rtcd_defs.pl b/libs/libvpx/vp8/common/rtcd_defs.pl index 8dc36f7317..3df745f75a 100644 --- a/libs/libvpx/vp8/common/rtcd_defs.pl +++ b/libs/libvpx/vp8/common/rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. 
All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp8_common_forward_decls() { print < #include -#define thread_sleep(nms) {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} +#define thread_sleep(nms) sched_yield(); +/* {struct timespec ts;ts.tv_sec=0; + ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ #endif /* Not Windows. Assume pthreads */ @@ -188,6 +192,16 @@ static inline int sem_destroy(sem_t *sem) { #endif #include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_atomics.h" + +static INLINE void vp8_atomic_spin_wait( + int mb_col, const vpx_atomic_int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { + x86_pause_hint(); + thread_sleep(0); + } +} #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ diff --git a/libs/libvpx/vp8/common/vp8_loopfilter.c b/libs/libvpx/vp8/common/vp8_loopfilter.c index c6430be465..9fb1250650 100644 --- a/libs/libvpx/vp8/common/vp8_loopfilter.c +++ b/libs/libvpx/vp8/common/vp8_loopfilter.c @@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; - } else /* Delta Value */ - { + } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; @@ -344,8 +342,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) { mode_info_context++; /* Skip border mb */ } - } else /* SIMPLE_LOOPFILTER */ - { + } else { /* SIMPLE_LOOPFILTER */ for (mb_row = 0; mb_row < mb_rows; ++mb_row) { for (mb_col = 0; mb_col < mb_cols; ++mb_col) { int skip_lf = (mode_info_context->mbmi.mode != B_PRED && diff --git a/libs/libvpx/vp8/common/vp8_skin_detection.c b/libs/libvpx/vp8/common/vp8_skin_detection.c new file mode 100644 index 0000000000..6739efa5fe --- /dev/null +++ b/libs/libvpx/vp8/common/vp8_skin_detection.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/alloccommon.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +static int avg_2x2(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 2; ++i, s += p) { + for (j = 0; j < 2; ++j) { + sum += s[j]; + } + } + return (sum + 2) >> 2; +} + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + if (bsize == SKIN_16X16) { + // Take the average of center 2x2 pixels. 
+ const int ysource = avg_2x2(y + 7 * stride + 7, stride); + const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv); + const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } else { + int num_skin = 0; + int i, j; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 3 * stride + 3, stride); + const int usource = avg_2x2(u + strideuv + 1, strideuv); + const int vsource = avg_2x2(v + strideuv + 1, strideuv); + num_skin += vpx_skin_pixel(ysource, usource, vsource, motion); + if (num_skin >= 2) return 1; + y += 8; + u += 4; + v += 4; + } + y += (stride << 3) - 16; + u += (strideuv << 2) - 8; + v += (strideuv << 2) - 8; + } + + return 0; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mb_row, mb_col, num_bl; + VP8_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + int offset = 0; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(skinmap)); + if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height, + VP8BORDERINPIXELS) < 0) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) { + const int is_skin = cpi->skin_map[offset++]; + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += 16; + src_y += 16; + } + y += (src_ystride << 4) - (num_bl << 4); + src_y += (src_ystride << 4) - (num_bl << 4); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif // OUTPUT_YUV_SKINMAP diff --git a/libs/libvpx/vp8/common/vp8_skin_detection.h b/libs/libvpx/vp8/common/vp8_skin_detection.h new file mode 100644 index 0000000000..4d27f5eb2e --- /dev/null +++ b/libs/libvpx/vp8/common/vp8_skin_detection.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_SKIN_DETECTION_H_ +#define VP8_COMMON_SKIN_DETECTION_H_ + +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +typedef enum { + // Skin detection based on 8x8 block. If two of them are identified as skin, + // the macroblock is marked as skin. + SKIN_8X8, + // Skin detection based on 16x16 block. 
+ SKIN_16X16 +} SKIN_DETECTION_BLOCK_SIZE; + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vp8/common/x86/copy_sse2.asm b/libs/libvpx/vp8/common/x86/copy_sse2.asm index 86fae26956..480faa2550 100644 --- a/libs/libvpx/vp8/common/x86/copy_sse2.asm +++ b/libs/libvpx/vp8/common/x86/copy_sse2.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_copy32xn_sse2( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vp8/common/x86/copy_sse3.asm b/libs/libvpx/vp8/common/x86/copy_sse3.asm index d789a40ccf..31ea898a3d 100644 --- a/libs/libvpx/vp8/common/x86/copy_sse3.asm +++ b/libs/libvpx/vp8/common/x86/copy_sse3.asm @@ -83,6 +83,7 @@ ret %endmacro +SECTION .text ;void vp8_copy32xn_sse3( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vp8/common/x86/dequantize_mmx.asm b/libs/libvpx/vp8/common/x86/dequantize_mmx.asm index 4e551f00aa..bfdd997782 100644 --- a/libs/libvpx/vp8/common/x86/dequantize_mmx.asm +++ b/libs/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) global sym(vp8_dequantize_b_impl_mmx) PRIVATE diff --git a/libs/libvpx/vp8/common/x86/idct_blk_mmx.c b/libs/libvpx/vp8/common/x86/idct_blk_mmx.c index 6194cdf8fa..fd804b1ca4 100644 --- a/libs/libvpx/vp8/common/x86/idct_blk_mmx.c +++ b/libs/libvpx/vp8/common/x86/idct_blk_mmx.c @@ -21,91 +21,3 @@ void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) { vp8_dequantize_b_impl_mmx(sq, dq, DQC); } - -void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, - int stride, char *eobs) { - int i; - - for (i = 0; i < 4; ++i) { - if (eobs[0] > 1) { - vp8_dequant_idct_add_mmx(q, dq, dst, stride); - } else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx(q[0] * dq[0], dst, stride, dst, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) { - vp8_dequant_idct_add_mmx(q + 16, dq, dst + 4, stride); - } else if (eobs[1] == 1) { - vp8_dc_only_idct_add_mmx(q[16] * dq[0], dst + 4, stride, dst + 4, stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - if (eobs[2] > 1) { - vp8_dequant_idct_add_mmx(q + 32, dq, dst + 8, stride); - } else if (eobs[2] == 1) { - vp8_dc_only_idct_add_mmx(q[32] * dq[0], dst + 8, stride, dst + 8, stride); - memset(q + 32, 0, 2 * sizeof(q[0])); - } - - if (eobs[3] > 1) { - vp8_dequant_idct_add_mmx(q + 48, dq, dst + 12, stride); - } else if (eobs[3] == 1) { - vp8_dc_only_idct_add_mmx(q[48] * dq[0], dst + 12, stride, dst + 12, - stride); - memset(q + 48, 0, 2 * sizeof(q[0])); - } - - q += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dstu, - unsigned char *dstv, int stride, - char *eobs) { - int i; - - for (i = 0; i < 2; ++i) { - if (eobs[0] > 1) { - vp8_dequant_idct_add_mmx(q, dq, dstu, stride); - } else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx(q[0] * dq[0], dstu, stride, dstu, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) { - vp8_dequant_idct_add_mmx(q + 16, dq, dstu + 4, stride); - } else if (eobs[1] == 1) { - 
vp8_dc_only_idct_add_mmx(q[16] * dq[0], dstu + 4, stride, dstu + 4, - stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - q += 32; - dstu += 4 * stride; - eobs += 2; - } - - for (i = 0; i < 2; ++i) { - if (eobs[0] > 1) { - vp8_dequant_idct_add_mmx(q, dq, dstv, stride); - } else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx(q[0] * dq[0], dstv, stride, dstv, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) { - vp8_dequant_idct_add_mmx(q + 16, dq, dstv + 4, stride); - } else if (eobs[1] == 1) { - vp8_dc_only_idct_add_mmx(q[16] * dq[0], dstv + 4, stride, dstv + 4, - stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - q += 32; - dstv += 4 * stride; - eobs += 2; - } -} diff --git a/libs/libvpx/vp8/common/x86/idctllm_mmx.asm b/libs/libvpx/vp8/common/x86/idctllm_mmx.asm index 96fa2c60d0..5773d9d84e 100644 --- a/libs/libvpx/vp8/common/x86/idctllm_mmx.asm +++ b/libs/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -31,6 +31,7 @@ ; * ; **************************************************************************/ +SECTION .text ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, ;int pitch, unsigned char *dest,int stride) diff --git a/libs/libvpx/vp8/common/x86/idctllm_sse2.asm b/libs/libvpx/vp8/common/x86/idctllm_sse2.asm index bf8e2c4021..560faba000 100644 --- a/libs/libvpx/vp8/common/x86/idctllm_sse2.asm +++ b/libs/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -19,6 +19,8 @@ ; int dst_stride - 3 ; ) +SECTION .text + global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE sym(vp8_idct_dequant_0_2x_sse2): push rbp diff --git a/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm b/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm deleted file mode 100644 index 158c3b7458..0000000000 --- a/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm +++ /dev/null @@ -1,140 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE -sym(vp8_short_inv_walsh4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - ; end prolog - - mov rdx, arg(0) - mov rax, 30003h - - movq mm0, [rdx + 0] ;ip[0] - movq mm1, [rdx + 8] ;ip[4] - movq mm7, rax - - movq mm2, [rdx + 16] ;ip[8] - movq mm3, [rdx + 24] ;ip[12] - punpcklwd mm7, mm7 ;0003000300030003h - mov rdx, arg(1) - - movq mm4, mm0 - movq mm5, mm1 - - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm4 ;temp al - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl - - ; 03 02 01 00 - ; 13 12 11 10 - ; 23 22 21 20 - ; 33 32 31 30 - - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 - - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] -;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - paddw mm1, mm7 - paddw mm6, mm7 - psraw mm1, 3 - psraw mm6, 3 - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl - paddw mm0, mm7 - paddw mm5, mm7 - psraw mm0, 3 - psraw mm5, 3 -;~~~~~~~~~~~~~~~~~~~~~ - - movd eax, mm1 - movd ecx, mm0 - psrlq mm0, 32 - psrlq mm1, 32 - mov word ptr[rdx+32*0], ax - mov word ptr[rdx+32*1], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*4], ax - mov word ptr[rdx+32*5], cx - movd eax, mm1 - movd ecx, mm0 - mov word ptr[rdx+32*8], ax - mov word ptr[rdx+32*9], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*12], ax - mov word ptr[rdx+32*13], cx - - movd eax, mm6 - movd ecx, mm5 - psrlq mm5, 32 - psrlq mm6, 32 - mov word ptr[rdx+32*2], ax - mov word ptr[rdx+32*3], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*6], ax - mov word ptr[rdx+32*7], cx - movd eax, mm6 - movd ecx, mm5 - mov word ptr[rdx+32*10], ax - mov word ptr[rdx+32*11], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*14], ax - mov word ptr[rdx+32*15], cx - - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - diff --git a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm index 06e86a80b6..82d7bf91a6 100644 --- a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE sym(vp8_short_inv_walsh4x4_sse2): diff --git a/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm index 6d5aaa19db..6a3d05290f 100644 --- a/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm +++ b/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -125,6 +125,8 @@ pxor %1, [GLOBAL(t80)] %endmacro +SECTION .text + 
;void vp8_loop_filter_bh_y_sse2 ;( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm index 1913abc69b..2ae028feae 100644 --- a/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -276,6 +276,8 @@ %endmacro +SECTION .text + %if ABI_IS_32BIT ;void vp8_loop_filter_horizontal_edge_sse2 diff --git a/libs/libvpx/vp8/common/x86/loopfilter_x86.c b/libs/libvpx/vp8/common/x86/loopfilter_x86.c index aa2f8f1acf..a187d51fbe 100644 --- a/libs/libvpx/vp8/common/x86/loopfilter_x86.c +++ b/libs/libvpx/vp8/common/x86/loopfilter_x86.c @@ -22,13 +22,6 @@ #define prototype_simple_loopfilter(sym) \ void sym(unsigned char *y, int ystride, const unsigned char *blimit) -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); -prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); -prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); -prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); - #if HAVE_SSE2 && ARCH_X86_64 prototype_loopfilter(vp8_loop_filter_bv_y_sse2); prototype_loopfilter(vp8_loop_filter_bh_y_sse2); @@ -44,105 +37,6 @@ extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; -#if HAVE_MMX -/* Horizontal MB filtering */ -void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - } -} - -/* Vertical MB Filtering */ -void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) { - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - } -} - -/* Horizontal B Filtering */ -void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) { - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - } -} - -void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * 
y_stride, y_stride, - blimit); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, - blimit); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, - blimit); -} - -/* Vertical B Filtering */ -void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) { - vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); - } -} - -void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} -#endif - /* Horizontal MB filtering */ #if HAVE_SSE2 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, diff --git a/libs/libvpx/vp8/common/x86/mfqe_sse2.asm b/libs/libvpx/vp8/common/x86/mfqe_sse2.asm index 8177b79226..3fde973ad2 100644 --- a/libs/libvpx/vp8/common/x86/mfqe_sse2.asm +++ b/libs/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff --git a/libs/libvpx/vp8/common/x86/recon_mmx.asm b/libs/libvpx/vp8/common/x86/recon_mmx.asm index 15e98713c7..e6a48f6b07 100644 --- a/libs/libvpx/vp8/common/x86/recon_mmx.asm +++ b/libs/libvpx/vp8/common/x86/recon_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void copy_mem8x8_mmx( ; unsigned char *src, @@ -117,158 +118,3 @@ sym(vp8_copy_mem8x4_mmx): UNSHADOW_ARGS pop rbp ret - - -;void copy_mem16x16_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp8_copy_mem16x16_mmx) PRIVATE -sym(vp8_copy_mem16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movsxd rax, dword ptr arg(1) ;src_stride; - - mov rdi, arg(2) ;dst; - movsxd rcx, dword ptr arg(3) ;dst_stride - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - 
movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq [rdi], mm0 - movq [rdi+8], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libs/libvpx/vp8/common/x86/recon_sse2.asm b/libs/libvpx/vp8/common/x86/recon_sse2.asm index cb89537f76..57f8899c77 100644 --- a/libs/libvpx/vp8/common/x86/recon_sse2.asm +++ b/libs/libvpx/vp8/common/x86/recon_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void copy_mem16x16_sse2( ; unsigned char *src, ; int src_stride, diff --git a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm index 47dd452297..1f3a2baca0 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -17,6 +17,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define vp8_filter_weight 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;void vp8_filter_block1d_h6_mmx ;( @@ -204,163 +205,6 @@ sym(vp8_filter_block1dc_v6_mmx): ret -;void bilinear_predict8x8_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_mmx) PRIVATE -sym(vp8_bilinear_predict8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - shl rax, 5 ; offset * 32 - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - - shl rax, 5 ; offset*32 - add rax, rcx ; VFilter - - lea rcx, [rdi+rdx*8] ; - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x8: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 
14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 ;dst_pitch -%endif - cmp rdi, rcx ; - jne .next_row_8x8 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - ;void bilinear_predict8x4_mmx ;( ; unsigned char *src_ptr, @@ -641,8 +485,8 @@ rd: times 4 dw 0x40 align 16 -global HIDDEN_DATA(sym(vp8_six_tap_mmx)) -sym(vp8_six_tap_mmx): +global HIDDEN_DATA(sym(vp8_six_tap_x86)) +sym(vp8_six_tap_x86): times 8 dw 0 times 8 dw 0 times 8 dw 128 diff --git a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm index ca00583ca2..6e70f6d2e8 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -16,6 +16,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The diff --git a/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm index 1f6cbd1d1e..8d55c9320c 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -15,6 +15,7 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. 
The diff --git a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c index af2c1e7633..b9d087e20d 100644 --- a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c +++ b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -13,7 +13,7 @@ #include "vpx_ports/mem.h" #include "filter_x86.h" -extern const short vp8_six_tap_mmx[8][6 * 8]; +extern const short vp8_six_tap_x86[8][6 * 8]; extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, @@ -82,103 +82,13 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, DECLARE_ALIGNED(16, unsigned short, FData2[16 * 16]); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, VFilter); } - -void vp8_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED(16, unsigned short, - FData2[24 * 24]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, - src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, - src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, - src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - FData2 + 12, src_pixels_per_line, 1, 21, 32, - HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 16, - VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16, 16, - 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16, 16, - 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16, 16, - 16, VFilter); -} - -void vp8_sixtap_predict8x8_mmx(unsigned char *src_ptr, int src_pixels_per_line, - int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED(16, unsigned short, - FData2[256]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, - src_pixels_per_line, 1, 13, 16, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, - src_pixels_per_line, 1, 13, 16, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 8, - VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8, - VFilter); -} - -void vp8_sixtap_predict8x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, - int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED(16, unsigned short, - FData2[256]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, - src_pixels_per_line, 1, 9, 16, HFilter); - 
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, - src_pixels_per_line, 1, 9, 16, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 8, - VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8, - VFilter); -} - -void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, - dst_ptr, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 8, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + dst_pitch * 8, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + dst_pitch * 8 + 8, dst_pitch); -} #endif #if HAVE_SSE2 @@ -195,21 +105,21 @@ void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, if (xoffset) { if (yoffset) { - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, VFilter); } else { /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); } } else { /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, @@ -226,21 +136,21 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, if (xoffset) { if (yoffset) { - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, dst_pitch, VFilter); } else { /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); } } else { /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); @@ -256,21 +166,21 @@ void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, if (xoffset) { if (yoffset) { - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, dst_pitch, VFilter); } else { /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; 
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); } } else { /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); diff --git a/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm deleted file mode 100644 index 88a07b9f3f..0000000000 --- a/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm +++ /dev/null @@ -1,1753 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void vp8_loop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE -sym(vp8_loop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 ; - - - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - - psubusb mm4, mm7 - por mm1, mm4 - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, [rsi] ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, 
arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - pxor mm0, mm0 ; - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - psraw mm5, 11 - packsswb mm0, mm5 - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - movq mm6, [rsi+2*rax] ; p1 - pxor mm6, [GLOBAL(t80)] ; reoffset - paddsb mm6, mm4 ; p1+= p1 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+2*rax], mm6 ; write back - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - movq [rdi], mm7 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .next8_h - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE -sym(vp8_loop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 64 ; reserve 64 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_v: - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - - ;transpose - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - - punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 - punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - psubusb mm5, mm7 ; q2-q3 - - psubusb mm7, mm6 ; q3-q2 - por mm7, mm5; ; mm7=abs (q3-q2) - - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - - psubusb mm3, mm6 ; q1-q2 - psubusb mm6, mm5 ; q2-q1 - - por mm6, mm3 ; mm6=abs(q2-q1) - lea rdx, srct - - movq [rdx+24], mm5 ; save q1 - movq [rdx+16], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+8], mm3 ; save p0 - - movq [rdx], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 - - psubusb mm0, mm4 - psubusb mm1, mm4 - - psubusb mm6, mm4 - por mm7, mm6 - - por mm0, mm1 - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+16] ; mm5=q0 - movq mm7, [rdx+24] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save 
abs(q1-q0) - psubusb mm7, mm4 - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - ; start work on filters - lea rdx, srct - - movq mm2, [rdx] ; p1 - movq mm7, [rdx+24] ; q1 - - movq mm6, [rdx+8] ; p0 - movq mm0, [rdx+16] ; q0 - - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - pxor mm0, mm0 ; - - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - - psraw mm5, 11 - packsswb mm0, mm5 - - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - ; mm6=p0 ; - movq mm1, [rdx] ; p1 - pxor mm1, [GLOBAL(t80)] ; reoffset - - paddsb mm1, mm4 ; p1+= p1 add - pxor mm1, [GLOBAL(t80)] ; unoffset - ; mm6 = p0 mm1 = p1 - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; mm3 = q0 - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - ; mm7 = q1 - - ; transpose and write back - ; mm1 = 72 62 52 42 32 22 12 02 - ; mm6 = 73 63 53 43 33 23 13 03 - ; mm3 = 74 64 54 44 34 24 14 04 - ; mm7 = 75 65 55 45 35 25 15 05 - - movq mm2, mm1 ; 72 62 52 42 32 22 12 02 - punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 - - movq mm4, mm3 ; 74 64 54 44 34 24 14 04 - punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 - - punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 - punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 - 
- movq mm6, mm2 ; 33 32 23 22 13 12 03 02 - punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 - - punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 - movq mm5, mm1 ; 73 72 63 62 53 52 43 42 - - punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 - punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 - - - ; mm2 = 15 14 13 12 05 04 03 02 - ; mm6 = 35 34 33 32 25 24 23 22 - ; mm5 = 55 54 53 52 45 44 43 42 - ; mm1 = 75 74 73 72 65 64 63 62 - - - - movd [rsi+rax*4+2], mm2 - psrlq mm2, 32 - - movd [rdi+rax*4+2], mm2 - movd [rsi+rax*2+2], mm6 - - psrlq mm6, 32 - movd [rsi+rax+2],mm6 - - movd [rsi+2], mm1 - psrlq mm1, 32 - - movd [rdi+2], mm1 - neg rax - - movd [rdi+rax+2],mm5 - psrlq mm5, 32 - - movd [rdi+rax*2+2], mm5 - - lea rsi, [rsi+rax*8] - dec rcx - jnz .next8_v - - add rsp, 64 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE -sym(vp8_mbloop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbh: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 - - - ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm3=q1, mm7 = limit - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm5 = p0 - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, mm0 ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb 
mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - - ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, mm4=hev - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rsi+rax], mm6 - movq [rsi], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdi] - movq mm6, 
[rsi+rax*2] ; p1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi], mm3 - movq [rsi+rax*2], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - - movq mm6, [rdi+rax*4] - neg rax - movq mm3, [rdi+rax ] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi+rax ], mm3 - neg rax - movq [rdi+rax*4], mm6 - -;EARLY_BREAK_OUT: - neg rax - add rsi,8 - dec rcx - jnz .next8_mbh - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE -sym(vp8_mbloop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbv: - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ;transpose - movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 - - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - - movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 - - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - lea rdx, srct - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - - movq [rdx+56], mm7 - psubusb mm5, mm7 ; q2-q3 - - - movq [rdx+48], mm6 - psubusb mm7, mm6 ; q3-q2 - - por mm7, mm5; ; mm7=abs (q3-q2) - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - psubusb mm3, mm6 ; q1-q2 - - psubusb mm6, mm5 ; q2-q1 - por mm6, mm3 ; mm6=abs(q2-q1) - - movq [rdx+40], mm5 ; save q1 - movq [rdx+32], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq [rdx], mm0 ; save p3 - movq [rdx+8], mm1 ; save p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+24], mm3 ; save p0 - - movq [rdx+16], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 ; abs(q3-q2) > limit - - psubusb mm0, mm4 ; abs(p3-p2) > limit - psubusb mm1, mm4 ; abs(p2-p1) > limit - - psubusb mm6, mm4 ; abs(q2-q1) > limit - por mm7, mm6 ; or - - por mm0, mm1 ; - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, 
mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+32] ; mm5=q0 - movq mm7, [rdx+40] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save abs(q1-q0) - psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 ; abs(q1 - q0) > thresh - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 ; abs(p1 - p0)> thresh - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - - ; start work on filters - lea rdx, srct - - ; start work on filters - movq mm2, [rdx+16] ; p1 - movq mm7, [rdx+40] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - movq mm6, [rdx+24] ; p0 - movq mm0, [rdx+32] ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 
- psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rdx+24], mm6 - movq [rdx+32], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdx + 40] - movq mm6, [rdx + 16] ; p1 - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdx + 40], mm3 - movq [rdx + 16], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm6, [rdx+ 8] - movq mm3, [rdx+48] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 - pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 - - ; transpose and write back - movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 - movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 - - punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 - punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 - - movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 - movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 - - punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 - punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 - - movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 - punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 - - punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 - movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 - - punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 - punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 - - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 - - movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 - punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 - - movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 - punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 - - punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 - movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 - - punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 - punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 - - movq [rsi+rax*4], mm0 ; write out - movq [rdi+rax*4], mm6 ; write out - - movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 - punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 - - punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 - movq [rsi+rax*2], mm0 ; write out - - movq [rdi+rax*2], mm5 ; write out - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - - punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 - punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 - - movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 - punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 - - punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 - movq mm0, mm1 ; mm0= 53 52 51 50 43 42 
41 40 - - movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 - punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 - - punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 - movq [rsi], mm0 ; write out - - movq [rdi], mm1 ; write out - neg rax - - punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 - punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 - - movq [rsi+rax*2], mm3 - movq [rdi+rax*2], mm4 - - lea rsi, [rsi+rax*8] - dec rcx - - jnz .next8_mbv - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE -sym(vp8_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char 
*src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE -sym(vp8_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; 
unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - -SECTION_RODATA -align 16 -tfe: - times 8 db 0xfe -align 16 -t80: - times 8 db 0x80 -align 16 -t1s: - times 8 db 0x01 -align 16 -t3: - times 8 db 0x03 -align 16 -t4: - times 8 db 0x04 -align 16 -ones: - times 4 dw 0x0001 -align 16 -s27: - times 4 dw 0x1b00 -align 16 -s18: - times 4 dw 0x1200 -align 16 -s9: - times 4 dw 0x0900 -align 16 -s63: - times 4 dw 0x003f diff --git a/libs/libvpx/vp8/decoder/decodeframe.c b/libs/libvpx/vp8/decoder/decodeframe.c index 0aec2a01b0..077bd3da26 100644 --- a/libs/libvpx/vp8/decoder/decodeframe.c +++ b/libs/libvpx/vp8/decoder/decodeframe.c @@ -930,7 +930,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { /* When error concealment is enabled we should only check the sync * code if we have enough bits available */ - if (!pbi->ec_active || data + 3 < data_end) { + if (data + 3 < data_end) { if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) { vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); @@ -941,13 +941,19 @@ int vp8_decode_frame(VP8D_COMP *pbi) { * if we have enough data. Otherwise we will end up with the wrong * size. 
*/ - if (!pbi->ec_active || data + 6 < data_end) { + if (data + 6 < data_end) { pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff; pc->horiz_scale = clear[4] >> 6; pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff; pc->vert_scale = clear[6] >> 6; + data += 7; + } else if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated key frame header"); + } else { + /* Error concealment is active, clear the frame. */ + data = data_end; } - data += 7; } else { memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); @@ -1199,7 +1205,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pbi->frame_corrupt_residual = 0; #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && + pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; vp8mt_decode_mb_rows(pbi, xd); vp8_yv12_extend_frame_borders(yv12_fb_new); diff --git a/libs/libvpx/vp8/decoder/decodemv.c b/libs/libvpx/vp8/decoder/decodemv.c index b946ab73d0..8e9600c6da 100644 --- a/libs/libvpx/vp8/decoder/decodemv.c +++ b/libs/libvpx/vp8/decoder/decodemv.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "decodemv.h" #include "treereader.h" #include "vp8/common/entropymv.h" #include "vp8/common/entropymode.h" @@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) { const vp8_prob *const p = (const vp8_prob *)mvc; int x = 0; - if (vp8_read(r, p[mvpis_short])) /* Large */ - { + if (vp8_read(r, p[mvpis_short])) { /* Large */ int i = 0; do { @@ -284,8 +284,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi) { vp8_reader *const bc = &pbi->mbc[8]; mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra); - if (mbmi->ref_frame) /* inter MB */ - { + if (mbmi->ref_frame) { /* inter MB */ enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; int cnt[4]; int *cntx = cnt; diff --git a/libs/libvpx/vp8/decoder/onyxd_if.c b/libs/libvpx/vp8/decoder/onyxd_if.c index c6b5660364..f516eb0c78 100644 --- a/libs/libvpx/vp8/decoder/onyxd_if.c +++ b/libs/libvpx/vp8/decoder/onyxd_if.c @@ -29,6 +29,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/systemdependent.h" +#include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "detokenize.h" @@ -40,7 +41,6 @@ #endif extern void vp8_init_loop_filter(VP8_COMMON *cm); -extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); static int get_free_fb(VP8_COMMON *cm); static void ref_cnt_fb(int *buf, int *idx, int new_idx); @@ -352,7 +352,7 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, goto decode_exit; } - vp8_clear_system_state(); + vpx_clear_system_state(); if (cm->show_frame) { cm->current_video_frame++; @@ -383,7 +383,7 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, decode_exit: pbi->common.error.setjmp = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); return retcode; } int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, @@ -416,7 +416,7 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, } #endif /*!CONFIG_POSTPROC*/ - vp8_clear_system_state(); + vpx_clear_system_state(); return ret; } @@ -438,47 +438,39 @@ int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame) { } int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { - if 
(!fb->use_frame_threads) { - /* decoder instance for single thread mode */ - fb->pbi[0] = create_decompressor(oxcf); - if (!fb->pbi[0]) return VPX_CODEC_ERROR; + /* decoder instance for single thread mode */ + fb->pbi[0] = create_decompressor(oxcf); + if (!fb->pbi[0]) return VPX_CODEC_ERROR; #if CONFIG_MULTITHREAD - if (setjmp(fb->pbi[0]->common.error.jmp)) { - vp8_remove_decoder_instances(fb); - memset(fb->pbi, 0, sizeof(fb->pbi) / sizeof(fb->pbi[0])); - vp8_clear_system_state(); - return VPX_CODEC_ERROR; - } - - fb->pbi[0]->common.error.setjmp = 1; - fb->pbi[0]->max_threads = oxcf->max_threads; - vp8_decoder_create_threads(fb->pbi[0]); - fb->pbi[0]->common.error.setjmp = 0; -#endif - } else { - /* TODO : create frame threads and decoder instances for each - * thread here */ + if (setjmp(fb->pbi[0]->common.error.jmp)) { + vp8_remove_decoder_instances(fb); + memset(fb->pbi, 0, sizeof(fb->pbi)); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; } + fb->pbi[0]->common.error.setjmp = 1; + fb->pbi[0]->max_threads = oxcf->max_threads; + vp8_decoder_create_threads(fb->pbi[0]); + fb->pbi[0]->common.error.setjmp = 0; +#endif return VPX_CODEC_OK; } int vp8_remove_decoder_instances(struct frame_buffers *fb) { - if (!fb->use_frame_threads) { - VP8D_COMP *pbi = fb->pbi[0]; + VP8D_COMP *pbi = fb->pbi[0]; - if (!pbi) return VPX_CODEC_ERROR; + if (!pbi) return VPX_CODEC_ERROR; #if CONFIG_MULTITHREAD - vp8_decoder_remove_threads(pbi); + vp8_decoder_remove_threads(pbi); #endif - /* decoder instance for single thread mode */ - remove_decompressor(pbi); - } else { - /* TODO : remove frame threads and decoder instances for each - * thread here */ - } - + /* decoder instance for single thread mode */ + remove_decompressor(pbi); return VPX_CODEC_OK; } + +int vp8dx_get_quantizer(const VP8D_COMP *cpi) { + return cpi->common.base_qindex; +} diff --git a/libs/libvpx/vp8/decoder/onyxd_int.h b/libs/libvpx/vp8/decoder/onyxd_int.h index 3a8a431465..5ecacdbb97 100644 --- a/libs/libvpx/vp8/decoder/onyxd_int.h +++ b/libs/libvpx/vp8/decoder/onyxd_int.h @@ -47,9 +47,6 @@ struct frame_buffers { * this struct will be populated with frame buffer management * info in future commits. */ - /* enable/disable frame-based threading */ - int use_frame_threads; - /* decoder instances */ struct VP8D_COMP *pbi[MAX_FB_MT_DEC]; }; @@ -70,7 +67,8 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - volatile int b_multithreaded_rd; + + vpx_atomic_int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -78,7 +76,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; - int *mt_current_mb_col; /* Each row remembers its already decoded column. */ + /* Each row remembers its already decoded column. 
*/ + vpx_atomic_int *mt_current_mb_col; unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; @@ -119,6 +118,8 @@ typedef struct VP8D_COMP { void *decrypt_state; } VP8D_COMP; +void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); int vp8_decode_frame(VP8D_COMP *cpi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); diff --git a/libs/libvpx/vp8/decoder/threading.c b/libs/libvpx/vp8/decoder/threading.c index 88f7723e33..d0213f75c1 100644 --- a/libs/libvpx/vp8/decoder/threading.c +++ b/libs/libvpx/vp8/decoder/threading.c @@ -20,6 +20,7 @@ #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" +#include "decoderthreading.h" #include "detokenize.h" #include "vp8/common/reconintra4x4.h" #include "vp8/common/reconinter.h" @@ -36,8 +37,6 @@ memset((p), 0, (n) * sizeof(*(p))); \ } while (0) -void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); - static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count) { VP8_COMMON *const pc = &pbi->common; @@ -50,9 +49,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; - mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1); - mbd->mode_info_stride = pc->mode_info_stride; - mbd->frame_type = pc->frame_type; mbd->pre = xd->pre; mbd->dst = xd->dst; @@ -83,7 +79,8 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; } - for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1; + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1); } static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -251,12 +248,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; - const int first_row_no_sync_above = pc->mb_cols + nsync; + const vpx_atomic_int first_row_no_sync_above = + VPX_ATOMIC_INIT(pc->mb_cols + nsync); int num_part = 1 << pbi->common.multi_token_partition; int last_mb_row = start_mb_row; @@ -289,6 +287,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->up_available = (start_mb_row != 0); + xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row; + xd->mode_info_stride = pc->mode_info_stride; + for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) { int recon_yoffset, recon_uvoffset; @@ -318,7 +319,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->left_available = 0; - xd->mb_to_top_edge = -((mb_row * 16)) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; if (pbi->common.filter_level) { @@ -355,14 +356,13 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride); } - for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) { - *current_mb_col = mb_col - 1; + for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { + if (((mb_col - 1) % nsync) == 0) { + 
vpx_atomic_store_release(current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(1); - } + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. @@ -548,7 +548,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - *current_mb_col = mb_col + nsync; + vpx_atomic_store_release(current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -568,10 +568,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (pbi->b_multithreaded_rd == 0) break; + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (pbi->b_multithreaded_rd == 0) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; @@ -589,7 +589,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { int core_count = 0; unsigned int ithread; - pbi->b_multithreaded_rd = 0; + vpx_atomic_init(&pbi->b_multithreaded_rd, 0); pbi->allocated_decoding_thread_count = 0; /* limit decoding threads to the max number of token partitions */ @@ -601,7 +601,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { } if (core_count > 1) { - pbi->b_multithreaded_rd = 1; + vpx_atomic_init(&pbi->b_multithreaded_rd, 1); pbi->decoding_thread_count = core_count - 1; CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); @@ -712,7 +712,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -730,27 +730,33 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; - /* Allocate an int for each mb row. */ - CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); + /* Allocate a vpx_atomic_int for each mb row. */ + CHECK_MEM_ERROR(pbi->mt_current_mb_col, + vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (width + (VP8BORDERINPIXELS << 1)))); + CHECK_MEM_ERROR( + pbi->mt_yabove_row[i], + vpx_memalign( + 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_uabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_vabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. 
*/ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); @@ -772,9 +778,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { int i; - pbi->b_multithreaded_rd = 0; + vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { diff --git a/libs/libvpx/vp8/encoder/bitstream.c b/libs/libvpx/vp8/encoder/bitstream.c index 1b100cfe8b..8cacb64505 100644 --- a/libs/libvpx/vp8/encoder/bitstream.c +++ b/libs/libvpx/vp8/encoder/bitstream.c @@ -19,6 +19,7 @@ #include #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" #include "bitstream.h" #include "defaultcoefcounts.h" @@ -499,8 +500,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); - } else /* inter coded */ - { + } else { /* inter coded */ int_mv best_mv; vp8_prob mv_ref_p[VP8_MVREFS - 1]; @@ -843,7 +843,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) { int new_intra, new_last, new_garf, oldtotal, newtotal; int ref_frame_cost[MAX_REF_FRAMES]; - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->common.frame_type != KEY_FRAME) { if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter))) new_intra = 1; @@ -908,7 +908,7 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #endif int savings = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); do { int j = 0; @@ -1295,7 +1295,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, #endif - vp8_clear_system_state(); + vpx_clear_system_state(); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING pack_coef_probs(cpi); @@ -1415,7 +1415,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { pack_mb_row_tokens(cpi, &cpi->bc[1]); } else { vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); diff --git a/libs/libvpx/vp8/encoder/bitstream.h b/libs/libvpx/vp8/encoder/bitstream.h index 2b196dcd27..ed45bff9e2 100644 --- a/libs/libvpx/vp8/encoder/bitstream.h +++ b/libs/libvpx/vp8/encoder/bitstream.h @@ -15,7 +15,15 @@ extern "C" { #endif +#include "vp8/encoder/treewriter.h" +#include "vp8/encoder/tokenize.h" + void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); +void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi); +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf); +int vp8_estimate_entropy_savings(struct VP8_COMP *cpi); +void vp8_update_coef_probs(struct VP8_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp8/encoder/encodeframe.c b/libs/libvpx/vp8/encoder/encodeframe.c index 600ba4061b..9bb0df72d5 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.c +++ b/libs/libvpx/vp8/encoder/encodeframe.c @@ -11,8 +11,12 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "bitstream.h" #include "encodemb.h" #include "encodemv.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif #include "vp8/common/common.h" #include "onyx_int.h" #include "vp8/common/extend.h" @@ -35,13 +39,6 @@ #include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); -extern void 
vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, - int prob_last, int prob_garf); -extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); -extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); -extern void vp8_auto_select_speed(VP8_COMP *cpi); -extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, - MB_ROW_COMP *mbr_ei, int count); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); #ifdef MODE_STATS @@ -344,11 +341,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; - const int rightmost_col = cm->mb_cols + nsync; - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; - if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { last_row_current_mb_col = &rightmost_col; @@ -418,14 +415,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { - *current_mb_col = mb_col - 1; /* set previous MB done */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(1); - } + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } } #endif @@ -565,7 +561,10 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) *current_mb_col = rightmost_col; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + vpx_atomic_store_release(current_mb_col, + vpx_atomic_load_acquire(&rightmost_col)); + } #endif /* this is to account for the border */ @@ -749,13 +748,14 @@ void vp8_encode_frame(VP8_COMP *cpi) { vpx_usec_timer_start(&emr_timer); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { int i; vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); - for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1; + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { sem_post(&cpi->h_event_start_encoding[i]); diff --git a/libs/libvpx/vp8/encoder/encodeframe.h b/libs/libvpx/vp8/encoder/encodeframe.h index c1d8634927..5274aba412 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.h +++ b/libs/libvpx/vp8/encoder/encodeframe.h @@ -10,24 +10,29 @@ #ifndef VP8_ENCODER_ENCODEFRAME_H_ #define VP8_ENCODER_ENCODEFRAME_H_ +#include "vp8/encoder/tokenize.h" + #ifdef __cplusplus extern "C" { #endif -extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); -extern void vp8_build_block_offsets(MACROBLOCK *x); +struct VP8_COMP; +struct macroblock; -extern void vp8_setup_block_ptrs(MACROBLOCK *x); +void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x); -extern void 
vp8_encode_frame(VP8_COMP *cpi); +void vp8_build_block_offsets(struct macroblock *x); -extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t, int recon_yoffset, - int recon_uvoffset, int mb_row, - int mb_col); +void vp8_setup_block_ptrs(struct macroblock *x); -extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t); +void vp8_encode_frame(struct VP8_COMP *cpi); + +int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t, int recon_yoffset, + int recon_uvoffset, int mb_row, int mb_col); + +int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t); #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp8/encoder/encodemv.c b/libs/libvpx/vp8/encoder/encodemv.c index cfc7af663f..ea93ccd710 100644 --- a/libs/libvpx/vp8/encoder/encodemv.c +++ b/libs/libvpx/vp8/encoder/encodemv.c @@ -12,6 +12,7 @@ #include "encodemv.h" #include "vp8/common/entropymode.h" #include "vp8/common/systemdependent.h" +#include "vpx_ports/system_state.h" #include @@ -24,14 +25,12 @@ static void encode_mvcomponent(vp8_writer *const w, const int v, const vp8_prob *p = mvc->prob; const int x = v < 0 ? -v : v; - if (x < mvnum_short) /* Small */ - { + if (x < mvnum_short) { /* Small */ vp8_write(w, 0, p[mvpis_short]); vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); if (!x) return; /* no sign bit */ - } else /* Large */ - { + } else { /* Large */ int i = 0; vp8_write(w, 1, p[mvpis_short]); @@ -126,7 +125,7 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, unsigned int cost0 = 0; unsigned int cost1 = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); i = 1; diff --git a/libs/libvpx/vp8/encoder/ethreading.c b/libs/libvpx/vp8/encoder/ethreading.c index 68ca0df08d..55a1528b14 100644 --- a/libs/libvpx/vp8/encoder/ethreading.c +++ b/libs/libvpx/vp8/encoder/ethreading.c @@ -14,6 +14,7 @@ #include "vp8/common/extend.h" #include "bitstream.h" #include "encodeframe.h" +#include "ethreading.h" #if CONFIG_MULTITHREAD @@ -25,11 +26,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMMON *cm = &cpi->common; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); @@ -47,7 +48,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; @@ -65,7 +66,10 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; + + xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); + xd->mode_info_stride = cm->mode_info_stride; for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { @@ -76,8 +80,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = 
(mb_row * cm->mb_cols); - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -103,13 +107,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - *current_mb_col = mb_col - 1; + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(1); - } + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -281,7 +284,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - *current_mb_col = mb_col + nsync; + vpx_atomic_store_release(current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -450,9 +453,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_stride = cm->mode_info_stride; - mbd->frame_type = cm->frame_type; mb->src = *cpi->Source; @@ -488,7 +488,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - cpi->b_multi_threaded = 0; + vpx_atomic_init(&cpi->b_multi_threaded, 0); cpi->encoding_thread_count = 0; cpi->b_lpf_running = 0; @@ -522,7 +522,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); - cpi->b_multi_threaded = 1; + vpx_atomic_store_release(&cpi->b_multi_threaded, 1); cpi->encoding_thread_count = th_count; /* @@ -551,7 +551,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); @@ -579,7 +579,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { sem_post(&cpi->h_event_start_encoding[ithread]); sem_post(&cpi->h_event_end_encoding[ithread]); @@ -605,9 +605,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); { int i; diff --git a/libs/libvpx/vp8/encoder/ethreading.h b/libs/libvpx/vp8/encoder/ethreading.h new file mode 100644 index 0000000000..95bf73d182 --- /dev/null +++ b/libs/libvpx/vp8/encoder/ethreading.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_ETHREADING_H_ +#define VP8_ENCODER_ETHREADING_H_ + +#include "vp8/encoder/onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x, + MB_ROW_COMP *mbr_ei, int count); +int vp8cx_create_encoder_threads(struct VP8_COMP *cpi); +void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_ETHREADING_H_ diff --git a/libs/libvpx/vp8/encoder/firstpass.c b/libs/libvpx/vp8/encoder/firstpass.c index cd34e33fb8..70f9243410 100644 --- a/libs/libvpx/vp8/encoder/firstpass.c +++ b/libs/libvpx/vp8/encoder/firstpass.c @@ -26,6 +26,7 @@ #include "vpx_scale/vpx_scale.h" #include "encodemb.h" #include "vp8/common/extend.h" +#include "vpx_ports/system_state.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "rdopt.h" @@ -499,7 +500,7 @@ void vp8_first_pass(VP8_COMP *cpi) { zero_ref_mv.as_int = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); x->src = *cpi->Source; xd->pre = *lst_yv12; @@ -741,10 +742,10 @@ void vp8_first_pass(VP8_COMP *cpi) { /* extend the recon for intra prediction */ vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - vp8_clear_system_state(); + vpx_clear_system_state(); } - vp8_clear_system_state(); + vpx_clear_system_state(); { double weight = 0.0; @@ -1184,9 +1185,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; current_spend_ratio = - (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) - ? 0.1 - : current_spend_ratio; + (current_spend_ratio > 10.0) + ? 10.0 + : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio; } /* Calculate a correction factor based on the quality of prediction in @@ -1272,8 +1273,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, + 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * @@ -1655,7 +1657,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.gf_group_bits = 0; cpi->twopass.gf_decay_rate = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); start_pos = cpi->twopass.stats_in; @@ -2268,7 +2270,7 @@ void vp8_second_pass(VP8_COMP *cpi) { return; } - vp8_clear_system_state(); + vpx_clear_system_state(); if (EOF == input_stats(cpi, &this_frame)) return; @@ -2543,7 +2545,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { memset(&next_frame, 0, sizeof(next_frame)); - vp8_clear_system_state(); + vpx_clear_system_state(); start_position = cpi->twopass.stats_in; cpi->common.frame_type = KEY_FRAME; diff --git a/libs/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/libs/libvpx/vp8/encoder/mips/mmi/dct_mmi.c new file mode 100644 index 0000000000..1f60a692d0 --- /dev/null +++ b/libs/libvpx/vp8/encoder/mips/mmi/dct_mmi.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +/* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. 
+ */ +#define TRANSPOSE_4H \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { + uint64_t tmp[1]; + int16_t *ip = input; + +#if _MIPS_SIM == _ABIO32 + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f4"); + register double ftmp3 asm("$f6"); + register double ftmp4 asm("$f8"); + register double ftmp5 asm("$f10"); + register double ftmp6 asm("$f12"); + register double ftmp7 asm("$f14"); + register double ftmp8 asm("$f16"); + register double ftmp9 asm("$f18"); + register double ftmp10 asm("$f20"); + register double ftmp11 asm("$f22"); + register double ftmp12 asm("$f24"); +#else + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f1"); + register double ftmp2 asm("$f2"); + register double ftmp3 asm("$f3"); + register double ftmp4 asm("$f4"); + register double ftmp5 asm("$f5"); + register double ftmp6 asm("$f6"); + register double ftmp7 asm("$f7"); + register double ftmp8 asm("$f8"); + register double ftmp9 asm("$f9"); + register double ftmp10 asm("$f10"); + register double ftmp11 asm("$f11"); + register double ftmp12 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + 
MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H + + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 + MMI_LI(%[tmp0], 0x0c) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" + "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 + "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" + "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + TRANSPOSE_4H + + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + + "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" + "ldc1 %[ftmp9], %[ff_ph_01] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "ldc1 %[ftmp9], %[ff_ph_07] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + + MMI_LI(%[tmp0], 0x10) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" + "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + + "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 
0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), + [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), + [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), + [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" + ); +} + +void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { + vp8_short_fdct4x4_mmi(input, output, pitch); + vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { + double ftmp[13]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + TRANSPOSE_4H + + "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // d + "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + // c + "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + // b + "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" + + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + + "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + TRANSPOSE_4H + + // op[2], op[0] + "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" + // op[3], op[1] + "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" + + // op[6], op[4] + "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" + // op[7], op[5] + "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" + + // op[10], op[8] + "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" + // op[11], op[9] + "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" + + // op[14], op[12] + "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" + // op[15], op[13] + "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" + + // a1, a3 + "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" + // 
d1, d3 + "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" + // c1, c3 + "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" + // b1, b3 + "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" + + // a1 + d1, a3 + d3 + "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" + // b1 + c1, b3 + c3 + "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" + // b1 - c1, b3 - c3 + "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + // a1 - d1, a3 - d3 + "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" + + // a2, a4 + "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" + // d2, d4 + "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" + // c2, c4 + "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" + // b2, b4 + "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" + + // a2 + d2, a4 + d4 + "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + // b2 + c2, b4 + c4 + "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" + // b2 - c2, b4 - c4 + "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" + // a2 - d2, a4 - d4 + "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" + "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" + "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + + "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + + MMI_LI(%[tmp0], 0x72) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 
%[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), + [tmp0]"=&r"(tmp[0]), + [ip]"+&r"(input) + : [op]"r"(output), + [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), + [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), + [ff_ph_01]"f"(ff_ph_01) + : "memory" + ); +} diff --git a/libs/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/libs/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c new file mode 100644 index 0000000000..3ccb196ffb --- /dev/null +++ b/libs/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +#define REGULAR_SELECT_EOB(i, rc) \ + z = coeff_ptr[rc]; \ + sz = (z >> 31); \ + x = (z ^ sz) - sz; \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ + x += round_ptr[rc]; \ + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ + if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } \ + } + +void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + const int16_t *coeff_ptr = b->coeff; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t *inv_zig_zag = vp8_default_inv_zig_zag; + + double ftmp[13]; + uint64_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL }; + int eob = 0; + + __asm__ volatile( + // loop 0 ~ 7 + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" + "li %[tmp0], 0x0f \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], 
%[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t" + + // loop 8 ~ 15 + "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 
0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" + + "li %[tmp0], 0x10 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xaa \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xffff \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" + "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [coeff_ptr] "r"((mips_reg)coeff_ptr), + [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), + [dequant_ptr] "r"((mips_reg)dequant_ptr), + [round_ptr] "r"((mips_reg)round_ptr), + [quant_ptr] "r"((mips_reg)quant_ptr), + [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob), + [ones] "f"(ones) + : "memory"); + + *d->eob = eob; +} + +void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + int eob = 0; + int x, y, z, sz, zbin; + const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + const int16_t *coeff_ptr = b->coeff; + const int16_t *zbin_ptr = b->zbin; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant; + const int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); + + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ + + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + REGULAR_SELECT_EOB(16, 15); + + *d->eob = (char)eob; +} diff --git a/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c b/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c 
index 11f70ae82e..9f5fbd39c8 100644 --- a/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c +++ b/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c @@ -12,10 +12,9 @@ #include "vp8/common/mips/msa/vp8_macros_msa.h" #include "vp8/encoder/block.h" -static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin, - int16_t *round, int16_t *quant, - int16_t *de_quant, int16_t *q_coeff, - int16_t *dq_coeff) { +static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round, + int16_t *quant, int16_t *de_quant, + int16_t *q_coeff, int16_t *dq_coeff) { int32_t cnt, eob; v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; v8i16 round0, round1; @@ -184,15 +183,14 @@ static int8_t exact_regular_quantize_b_msa( void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) { int16_t *coeff_ptr = b->coeff; - int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant_fast; int16_t *qcoeff_ptr = d->qcoeff; int16_t *dqcoeff_ptr = d->dqcoeff; int16_t *dequant_ptr = d->dequant; - *d->eob = fast_quantize_b_msa(coeff_ptr, zbin_ptr, round_ptr, quant_ptr, - dequant_ptr, qcoeff_ptr, dqcoeff_ptr); + *d->eob = fast_quantize_b_msa(coeff_ptr, round_ptr, quant_ptr, dequant_ptr, + qcoeff_ptr, dqcoeff_ptr); } void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) { diff --git a/libs/libvpx/vp8/encoder/onyx_if.c b/libs/libvpx/vp8/encoder/onyx_if.c index 6ebf233edf..2243182425 100644 --- a/libs/libvpx/vp8/encoder/onyx_if.c +++ b/libs/libvpx/vp8/encoder/onyx_if.c @@ -12,10 +12,12 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vp8_rtcd.h" +#include "bitstream.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/blockd.h" #include "onyx_int.h" #include "vp8/common/systemdependent.h" +#include "vp8/common/vp8_skin_detection.h" #include "vp8/encoder/quantize.h" #include "vp8/common/alloccommon.h" #include "mcomp.h" @@ -33,7 +35,9 @@ #include "vp8/common/reconintra.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" +#include "vpx_ports/system_state.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_write_yuv_frame.h" #if ARCH_ARM #include "vpx_ports/arm.h" #endif @@ -41,35 +45,32 @@ #include "mr_dissim.h" #endif #include "encodeframe.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "picklpf.h" +#if !CONFIG_REALTIME_ONLY +#include "temporal_filter.h" +#endif +#include #include #include #include #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING extern int vp8_update_coef_context(VP8_COMP *cpi); -extern void vp8_update_coef_probs(VP8_COMP *cpi); #endif -extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); -extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val); -extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); - extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); extern void print_tree_update_probs(); -extern int vp8cx_create_encoder_threads(VP8_COMP *cpi); -extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi); - -int vp8_estimate_entropy_savings(VP8_COMP *cpi); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); - static void set_default_lf_deltas(VP8_COMP *cpi); extern const int vp8_gf_interval_table[101]; @@ -85,6 +86,9 @@ FILE *yuv_file; #ifdef 
OUTPUT_YUV_DENOISED FILE *yuv_denoised_file; #endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif #if 0 FILE *framepsnr; @@ -217,7 +221,8 @@ static void save_layer_context(VP8_COMP *cpi) { lc->inter_frame_target = cpi->inter_frame_target; lc->total_byte_count = cpi->total_byte_count; lc->filter_level = cpi->common.filter_level; - + lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; + lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, @@ -253,7 +258,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->inter_frame_target = lc->inter_frame_target; cpi->total_byte_count = lc->total_byte_count; cpi->common.filter_level = lc->filter_level; - + cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; + cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, @@ -602,6 +608,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) { set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); } +static void compute_skin_map(VP8_COMP *cpi) { + int mb_row, mb_col, num_bl; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + + const SKIN_DETECTION_BLOCK_SIZE bsize = + (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + cpi->skin_map[bl_index] = + vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, cpi->consec_zero_last[bl_index], 0); + num_bl++; + src_y += 16; + src_u += 8; + src_v += 8; + } + src_y += (src_ystride << 4) - (num_bl << 4); + src_u += (src_uvstride << 3) - (num_bl << 3); + src_v += (src_uvstride << 3) - (num_bl << 3); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). Skip the boundary. 
+ for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) { + for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + + for (mi = -1; mi <= 1; mi += 1) { + for (mj = -1; mj <= 1; mj += 1) { + int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + static void set_default_lf_deltas(VP8_COMP *cpi) { cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; @@ -714,6 +773,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; int Mode = cpi->compressor_speed; int Speed = cpi->Speed; + int Speed2; int i; VP8_COMMON *cm = &cpi->common; int last_improved_quant = sf->improved_quant; @@ -815,9 +875,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) { cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] = cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed, mode_check_freq_map_vhbpred); - cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1); + + // For real-time mode at speed 10 keep the mode_check_freq threshold + // for NEW1 similar to that of speed 9. + Speed2 = Speed; + if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1); + cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] = speed_map(Speed, mode_check_freq_map_new2); + cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed, mode_check_freq_map_split1); cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] = @@ -1163,9 +1230,13 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } if (cpi->oxcf.multi_threaded > 1) { + int i; + vpx_free(cpi->mt_current_mb_col); CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); } #endif @@ -1466,6 +1537,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + // GF behavior for 1 pass CBR, used when error_resilience is off. 
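/* Context for the assignment below: gf_interval_onepass_cbr is derived at
 * compressor creation (later in this patch) as roughly two full cyclic-refresh
 * sweeps of the frame, clamped to [6, 40] frames.  Sketch of that derivation;
 * the refresh budget divisor shown here (MBs / 7) is the initial value from
 * the surrounding code and may be adjusted for other layer configurations. */
static int gf_interval_onepass_cbr_sketch(int mb_rows, int mb_cols) {
  const int mbs = mb_rows * mb_cols;
  const int refresh_mbs_per_frame = mbs / 7;
  int interval =
      refresh_mbs_per_frame > 0 ? (2 * mbs) / refresh_mbs_per_frame : 10;
  if (interval < 6) interval = 6;
  if (interval > 40) interval = 40;
  return interval; /* e.g. 640x480: 1200 MBs, budget 171, interval 14 */
}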
+ if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode == MODE_REALTIME) + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) cpi->oxcf.token_partitions = 3; #endif @@ -1476,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { setup_features(cpi); - { + if (!cpi->use_roi_static_threshold) { int i; - for (i = 0; i < MAX_MB_SEGMENTS; ++i) { cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1592,8 +1668,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); Scale2Ratio(cm->vert_scale, &vr, &vs); @@ -1739,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->active_map_enabled = 0; + cpi->use_roi_static_threshold = 0; + #if 0 /* Experimental code for lagged and one pass */ /* Initialise one_pass GF frames stats */ @@ -1765,9 +1842,13 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->mse_source_denoised = 0; /* Should we use the cyclic refresh method. - * Currently this is tied to error resilliant mode + * Currently there is no external control for this. + * Enable it for error_resilient_mode, or for 1 pass CBR mode. */ - cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode; + cpi->cyclic_refresh_mode_enabled = + (cpi->oxcf.error_resilient_mode || + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode <= 2)); cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 7; if (cpi->oxcf.number_of_layers == 1) { @@ -1780,6 +1861,23 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_mode_index = 0; cpi->cyclic_refresh_q = 32; + // GF behavior for 1 pass CBR, used when error_resilience is off. + cpi->gf_update_onepass_cbr = 0; + cpi->gf_noboost_onepass_cbr = 0; + if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cpi->oxcf.Mode <= 2) { + cpi->gf_update_onepass_cbr = 1; + cpi->gf_noboost_onepass_cbr = 1; + cpi->gf_interval_onepass_cbr = + cpi->cyclic_refresh_mode_max_mbs_perframe > 0 + ? 
(2 * (cpi->common.mb_rows * cpi->common.mb_cols) / + cpi->cyclic_refresh_mode_max_mbs_perframe) + : 10; + cpi->gf_interval_onepass_cbr = + VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr)); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + } + if (cpi->cyclic_refresh_mode_enabled) { CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); @@ -1787,6 +1885,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_map = (signed char *)NULL; } + CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, @@ -1810,6 +1911,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.refresh_alt_ref_frame = 0; cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1863,6 +1965,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2028,11 +2133,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double total_encode_time = - (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; - const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000; - const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2040,7 +2141,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" - "GLPsnrP\tVPXSSIM\t\n"); + "GLPsnrP\tVPXSSIM\n"); for (i = 0; i < (int)cpi->oxcf.number_of_layers; ++i) { double dr = (double)cpi->bytes_in_layer[i] * 8.0 / 1000.0 / time_encoded; @@ -2072,14 +2173,12 @@ void vp8_remove_compressor(VP8_COMP **ptr) { fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" - "GLPsnrP\tVPXSSIM\tTime(us)\tRc-Err\t" - "Abs Err\n"); + "GLPsnrP\tVPXSSIM\n"); fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%8.0f\t%7.2f\t%7.2f\n", + "%7.3f\n", dr, cpi->total / cpi->count, total_psnr, - cpi->totalp / cpi->count, total_psnr2, total_ssim, - total_encode_time, rate_err, fabs(rate_err)); + cpi->totalp / cpi->count, total_psnr2, total_ssim); } } fclose(f); @@ -2220,6 +2319,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { dealloc_compressor_data(cpi); vpx_free(cpi->mb.ss); vpx_free(cpi->tok); + vpx_free(cpi->skin_map); vpx_free(cpi->cyclic_refresh_map); vpx_free(cpi->consec_zero_last); vpx_free(cpi->consec_zero_last_mvbias); @@ -2234,6 +2334,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif #if 0 @@ -2296,7 +2399,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, recon += recon_stride; } - vp8_clear_system_state(); + vpx_clear_system_state(); return total_sse; } @@ -2410,42 +2513,13 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) -void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { - 
unsigned char *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); -} -#endif - static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; /* are we resizing the image */ if (cm->horiz_scale != 0 || cm->vert_scale != 0) { #if CONFIG_SPATIAL_RESAMPLING - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; int tmp_height; if (cm->vert_scale == 3) { @@ -2478,8 +2552,7 @@ static int resize_key_frame(VP8_COMP *cpi) { */ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; int new_width, new_height; /* If we are below the resample DOWN watermark then scale down a @@ -2691,7 +2764,7 @@ static int decide_key_frame(VP8_COMP *cpi) { if (cpi->Speed > 11) return 0; /* Clear down mmx registers */ - vp8_clear_system_state(); + vpx_clear_system_state(); if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0)) { double change = 1.0 * @@ -2852,8 +2925,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; - } else /* For non key frames */ - { + } else { if (cm->refresh_alt_ref_frame) { assert(!cm->copy_buffer_to_arf); @@ -2874,8 +2946,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_arf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME; yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; @@ -2907,8 +2978,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_gf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME; yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; @@ -2939,8 +3009,7 @@ static void update_reference_frames(VP8_COMP *cpi) { int i; for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]); - } else /* For non key frames */ - { + } else { vp8_yv12_extend_frame_borders( &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); @@ -2995,6 +3064,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, } // Only return non-zero if we have at least ~1/16 samples for estimate. 
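/* Note on the vp8_clear_system_state() -> vpx_clear_system_state() renames
 * throughout this file: the call now comes from the shared
 * vpx_ports/system_state.h helper included at the top of this diff.  As a
 * rough sketch of what such a helper amounts to on x86 (an assumption for
 * illustration only; the real implementation is configuration dependent), it
 * clears MMX register state so the floating-point code that follows is safe: */
#if defined(__i386__) || defined(__x86_64__)
static void clear_system_state_sketch(void) { __asm__ volatile("emms"); }
#else
static void clear_system_state_sketch(void) {}
#endif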
if (num_blocks > (tot_num_blocks >> 4)) { + assert(num_blocks != 0); return (Total / num_blocks); } else { return 0; @@ -3129,7 +3199,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { } else { struct vpx_usec_timer timer; - vp8_clear_system_state(); + vpx_clear_system_state(); vpx_usec_timer_start(&timer); if (cpi->sf.auto_filter == 0) { @@ -3171,7 +3241,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { } #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ } #endif @@ -3217,7 +3287,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int drop_mark25 = drop_mark / 8; /* Clear down mmx registers to allow floating point in what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->force_next_frame_intra) { cm->frame_type = KEY_FRAME; /* delayed intra frame */ @@ -3576,7 +3646,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * There is some odd behavior for one pass here that needs attention. */ if ((cpi->pass == 2) || (cpi->ni_frames > 150)) { - vp8_clear_system_state(); + vpx_clear_system_state(); Q = cpi->active_worst_quality; @@ -3725,6 +3795,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif + compute_skin_map(cpi); + /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. */ @@ -3798,11 +3870,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif #ifdef OUTPUT_YUV_SRC - vp8_write_yuv_frame(yuv_file, cpi->Source); + vpx_write_yuv_frame(yuv_file, cpi->Source); #endif do { - vp8_clear_system_state(); + vpx_clear_system_state(); vp8_set_quantizer(cpi, Q); @@ -3927,7 +3999,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->oxcf.screen_content_mode == 2) { + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; } @@ -3935,7 +4007,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; #endif - vp8_clear_system_state(); + vpx_clear_system_state(); /* Test to see if the stats generated for this frame indicate that * we should have coded a key frame (assuming that we didn't)! @@ -3979,7 +4051,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif } - vp8_clear_system_state(); + vpx_clear_system_state(); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; @@ -4003,9 +4075,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #if !CONFIG_REALTIME_ONLY top_index = cpi->active_worst_quality; #endif // !CONFIG_REALTIME_ONLY - /* If we have updated the active max Q do not call - * vp8_update_rate_correction_factors() this loop. - */ + /* If we have updated the active max Q do not call + * vp8_update_rate_correction_factors() this loop. 
+ */ active_worst_qchanged = 1; } else { active_worst_qchanged = 0; @@ -4204,6 +4276,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } while (Loop == 1); +#if defined(DROP_UNCODED_FRAMES) + /* if there are no coded macroblocks at all drop this frame */ + if (cpi->common.MBs == cpi->mb.skip_true_count && + (cpi->drop_frame_count & 7) != 7 && cm->frame_type != KEY_FRAME) { + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->drop_frame_count++; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + return; + } + cpi->drop_frame_count = 0; +#endif + #if 0 /* Experimental code for lagged and one pass * Update stats used for one pass GF selection @@ -4345,11 +4431,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp8_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif + #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; + /* wait for the filter_level to be picked so that we can continue with + * stream packing */ + sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -4359,7 +4454,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_reference_frames(cpi); #ifdef OUTPUT_YUV_DENOISED - vp8_write_yuv_frame(yuv_denoised_file, + vpx_write_yuv_frame(yuv_denoised_file, &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); #endif @@ -4369,12 +4464,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif -#if CONFIG_MULTITHREAD - /* wait that filter_level is picked so that we can continue with stream - * packing */ - if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf); -#endif - /* build the bitstream */ vp8_pack_bitstream(cpi, dest, dest_end, size); @@ -4549,7 +4638,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, { FILE *f = fopen("tmp.stt", "a"); - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->twopass.total_left_stats.coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 @@ -4708,7 +4797,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif /* DEBUG */ - /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ + /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ } #if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -4779,7 +4868,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (setjmp(cpi->common.error.jmp)) { cpi->common.error.setjmp = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); return VPX_CODEC_CORRUPT_FRAME; } @@ -4986,7 +5075,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, *size = 0; /* Clear down mmx registers */ - vp8_clear_system_state(); + vpx_clear_system_state(); cm->frame_type = INTER_FRAME; cm->frame_flags = *frame_flags; @@ -5139,7 +5228,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0); - vp8_clear_system_state(); + vpx_clear_system_state(); ye = calc_plane_error(orig->y_buffer, orig->y_stride, pp->y_buffer, pp->y_stride, y_width, y_height); @@ -5216,7 +5305,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #if 
CONFIG_MULTITHREAD /* wait for the lpf thread done */ - if (cpi->b_multi_threaded && cpi->b_lpf_running) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { sem_wait(&cpi->h_event_end_lpf); cpi->b_lpf_running = 0; } @@ -5249,7 +5338,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, } #endif - vp8_clear_system_state(); + vpx_clear_system_state(); return ret; } } @@ -5262,9 +5351,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, const int range = 63; int i; - // This method is currently incompatible with the cyclic refresh method - if (cpi->cyclic_refresh_mode_enabled) return -1; - // Check number of rows and columns match if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { return -1; @@ -5283,7 +5369,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, return -1; } - if (!map) { + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { disable_segmentation(cpi); return 0; } @@ -5320,6 +5410,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, /* Initialise the feature data structure */ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + return 0; } diff --git a/libs/libvpx/vp8/encoder/onyx_int.h b/libs/libvpx/vp8/encoder/onyx_int.h index 59ad5773a6..c489b46c2d 100644 --- a/libs/libvpx/vp8/encoder/onyx_int.h +++ b/libs/libvpx/vp8/encoder/onyx_int.h @@ -249,6 +249,10 @@ typedef struct { int filter_level; + int frames_since_last_drop_overshoot; + + int force_maxqp; + int last_frame_percent_intra; int count_mb_ref_frame_usage[MAX_REF_FRAMES]; @@ -413,6 +417,9 @@ typedef struct VP8_COMP { int drop_frames_allowed; /* Are we permitted to drop frames? */ int drop_frame; /* Drop this frame? */ +#if defined(DROP_UNCODED_FRAMES) + int drop_frame_count; +#endif vp8_prob frame_coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [ENTROPY_NODES]; @@ -468,6 +475,8 @@ typedef struct VP8_COMP { int zeromv_count; int lf_zeromv_pct; + unsigned char *skin_map; + unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; int segment_encode_breakout[MAX_MB_SEGMENTS]; @@ -500,12 +509,18 @@ typedef struct VP8_COMP { int mse_source_denoised; int force_maxqp; + int frames_since_last_drop_overshoot; + + // GF update for 1 pass cbr. + int gf_update_onepass_cbr; + int gf_interval_onepass_cbr; + int gf_noboost_onepass_cbr; #if CONFIG_MULTITHREAD /* multithread data */ - int *mt_current_mb_col; + vpx_atomic_int *mt_current_mb_col; int mt_sync_range; - int b_multi_threaded; + vpx_atomic_int b_multi_threaded; int encoding_thread_count; int b_lpf_running; @@ -677,6 +692,9 @@ typedef struct VP8_COMP { int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; } rd_costs; + + // Use the static threshold from ROI settings. 
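/* For reference, a hypothetical caller of the ROI interface changed above,
 * going through the public VP8E_SET_ROI_MAP control (the helper name and the
 * chosen deltas are illustrative).  With this patch, a map whose q/lf deltas
 * and static thresholds are all zero now disables segmentation, and any
 * non-zero static threshold sets use_roi_static_threshold and turns off
 * cyclic refresh. */
#include <stdlib.h>
#include <string.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static int apply_roi_sketch(vpx_codec_ctx_t *enc, unsigned int mb_rows,
                            unsigned int mb_cols) {
  vpx_roi_map_t roi;
  int ret;
  memset(&roi, 0, sizeof(roi));
  roi.rows = mb_rows;
  roi.cols = mb_cols;
  roi.roi_map = (unsigned char *)calloc(mb_rows * mb_cols, 1); /* all segment 0 */
  if (!roi.roi_map) return -1;
  roi.delta_q[1] = -10; /* better quality for blocks mapped to segment 1 */
  ret = vpx_codec_control(enc, VP8E_SET_ROI_MAP, &roi) ? -1 : 0;
  free(roi.roi_map);
  return ret;
}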
+ int use_roi_static_threshold; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/libs/libvpx/vp8/encoder/pickinter.c b/libs/libvpx/vp8/encoder/pickinter.c index 7b68d35f50..a9943eb6ab 100644 --- a/libs/libvpx/vp8/encoder/pickinter.c +++ b/libs/libvpx/vp8/encoder/pickinter.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -24,6 +25,7 @@ #include "vp8/common/reconintra4x4.h" #include "vpx_dsp/variance.h" #include "mcomp.h" +#include "vp8/common/vp8_skin_detection.h" #include "rdopt.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -35,82 +37,9 @@ extern unsigned int cnt_pm; #endif -#define MODEL_MODE 1 - extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; -// Fixed point implementation of a skin color classifier. Skin color -// is model by a Gaussian distribution in the CbCr color space. -// See ../../test/skin_color_detector_test.cc where the reference -// skin color classifier is defined. - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Evaluates the Mahalanobis distance measure for the input CbCr values. -static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -// Checks if the input yCbCr values corresponds to skin color. -static int is_skin_color(int y, int cb, int cr, int consec_zeromv) { - if (y < 40 || y > 220) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // No skin if block has been zero motion for long consecutive time. - if (consec_zeromv > 60) return 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; ++i) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { - return 0; - } else if (consec_zeromv > 25 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) { - return 0; - } else { - return 1; - } - } - // Exit if difference is much large than the threshold. 
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - static int macroblock_corner_grad(unsigned char *signal, int stride, int offsetx, int offsety, int sgnx, int sgny) { @@ -206,8 +135,8 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, (void)mvcost; (void)distortion; (void)sse; - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->as_mv.row *= 8; + bestmv->as_mv.col *= 8; return 0; } @@ -299,8 +228,8 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d); + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, d = 0; if (mb->e_mbd.frame_type == KEY_FRAME) { const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); @@ -313,6 +242,7 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { cost += r; distortion += d; + assert(best_mode != B_MODE_COUNT); mic->bmi[i].as_mode = best_mode; /* Break out case where we have already exceeded best so far value @@ -353,7 +283,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) { int Vaverage = 0; int diff; int pred_error[4] = { 0, 0, 0, 0 }, best_error = INT_MAX; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + MB_PREDICTION_MODE best_mode = MB_MODE_COUNT; for (i = 0; i < 8; ++i) { uleft_col[i] = x->dst.u_buffer[i * x->dst.uv_stride - 1]; @@ -442,6 +372,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) { } } + assert(best_mode != MB_MODE_COUNT); mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode; } @@ -757,27 +688,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #endif // Check if current macroblock is in skin area. 
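/* The per-macroblock sampling removed just below is replaced by a lookup into
 * cpi->skin_map[], which compute_skin_map() (added earlier in this patch)
 * fills once per frame via the new shared vp8_compute_skin_block() helper.
 * Sketch of what that helper is assumed to do for a 16x16 block -- its body is
 * not part of this hunk, so names and details are illustrative: average the
 * four centre samples of each plane and run the fixed-point skin-colour
 * classifier that this file used to carry (removed above). */
static int compute_skin_block_sketch(const unsigned char *y,
                                     const unsigned char *u,
                                     const unsigned char *v, int y_stride,
                                     int uv_stride, int consec_zeromv) {
  const int y_avg = (y[7 * y_stride + 7] + y[7 * y_stride + 8] +
                     y[8 * y_stride + 7] + y[8 * y_stride + 8]) >> 2;
  const int u_avg = (u[3 * uv_stride + 3] + u[3 * uv_stride + 4] +
                     u[4 * uv_stride + 3] + u[4 * uv_stride + 4]) >> 2;
  const int v_avg = (v[3 * uv_stride + 3] + v[3 * uv_stride + 4] +
                     v[4 * uv_stride + 3] + v[4 * uv_stride + 4]) >> 2;
  /* is_skin_color(): the classifier shown in the deletion from this file. */
  return is_skin_color(y_avg, u_avg, v_avg, consec_zeromv);
}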
- { - const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] + - x->src.y_buffer[7 * x->src.y_stride + 8] + - x->src.y_buffer[8 * x->src.y_stride + 7] + - x->src.y_buffer[8 * x->src.y_stride + 8]) >> - 2; - const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] + - x->src.u_buffer[3 * x->src.uv_stride + 4] + - x->src.u_buffer[4 * x->src.uv_stride + 3] + - x->src.u_buffer[4 * x->src.uv_stride + 4]) >> - 2; - const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] + - x->src.v_buffer[3 * x->src.uv_stride + 4] + - x->src.v_buffer[4 * x->src.uv_stride + 3] + - x->src.v_buffer[4 * x->src.uv_stride + 4]) >> - 2; - x->is_skin = 0; - if (!cpi->oxcf.screen_content_mode) { - int block_index = mb_row * cpi->common.mb_cols + mb_col; - x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]); - } + x->is_skin = 0; + if (!cpi->oxcf.screen_content_mode) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + x->is_skin = cpi->skin_map[block_index]; } #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { diff --git a/libs/libvpx/vp8/encoder/picklpf.c b/libs/libvpx/vp8/encoder/picklpf.c index 6f287322ec..b1b712db9a 100644 --- a/libs/libvpx/vp8/encoder/picklpf.c +++ b/libs/libvpx/vp8/encoder/picklpf.c @@ -12,6 +12,7 @@ #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" +#include "vp8/encoder/picklpf.h" #include "vp8/encoder/quantize.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" diff --git a/libs/libvpx/vp8/encoder/picklpf.h b/libs/libvpx/vp8/encoder/picklpf.h new file mode 100644 index 0000000000..e6ad0dbf26 --- /dev/null +++ b/libs/libvpx/vp8/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_PICKLPF_H_ +#define VP8_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct yv12_buffer_config; + +void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP8_COMP *cpi); +void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val); +void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_PICKLPF_H_ diff --git a/libs/libvpx/vp8/encoder/ratectrl.c b/libs/libvpx/vp8/encoder/ratectrl.c index 649f696dc7..e58c310980 100644 --- a/libs/libvpx/vp8/encoder/ratectrl.c +++ b/libs/libvpx/vp8/encoder/ratectrl.c @@ -22,6 +22,7 @@ #include "vp8/common/systemdependent.h" #include "encodemv.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/system_state.h" #define MIN_BPB_FACTOR 0.01 #define MAX_BPB_FACTOR 50 @@ -296,7 +297,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { uint64_t target; /* Clear down mmx registers to allow floating point in what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->oxcf.fixed_q >= 0) { int Q = cpi->oxcf.key_q; @@ -497,11 +498,9 @@ static void calc_gf_params(VP8_COMP *cpi) { * This is updated once the real frame size/boost is known. 
*/ if (cpi->oxcf.fixed_q == -1) { - if (cpi->pass == 2) /* 2 Pass */ - { + if (cpi->pass == 2) { /* 2 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - } else /* 1 Pass */ - { + } else { /* 1 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++; @@ -884,61 +883,61 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { /* Adjust target frame size for Golden Frames: */ if (cpi->oxcf.error_resilient_mode == 0 && (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame) { - int Q = - (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + if (!cpi->gf_update_onepass_cbr) { + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] + : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ - int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + - cpi->recent_ref_frame_usage[LAST_FRAME] + - cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]; + int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; - int pct_gf_active = (100 * cpi->gf_active_count) / - (cpi->common.mb_rows * cpi->common.mb_cols); + int pct_gf_active = (100 * cpi->gf_active_count) / + (cpi->common.mb_rows * cpi->common.mb_cols); - if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; - } - - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; - - /* Is a fixed manual GF frequency being used */ - if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high - */ - if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { - cpi->common.refresh_golden_frame = 1; - - /* Two pass GF descision */ - } else if (cpi->pass == 2) { - cpi->common.refresh_golden_frame = 1; + if (tot_mbs) { + gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; + } + + if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + + /* Is a fixed manual GF frequency being used */ + if (cpi->auto_gold) { + /* For one pass throw a GF if recent frame intra useage is + * low or the GF useage is high + */ + if ((cpi->pass == 0) && + (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + cpi->common.refresh_golden_frame = 1; + + /* Two pass GF descision */ + } else if (cpi->pass == 2) { + cpi->common.refresh_golden_frame = 1; + } } - } #if 0 - /* Debug stats */ - if (0) - { - FILE *f; + /* Debug stats */ + if (0) { + FILE *f; - f = fopen("gf_useaget.stt", "a"); - fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", - cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); - fclose(f); - } + f = fopen("gf_useaget.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->gfu_boost, + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + fclose(f); + } #endif - if (cpi->common.refresh_golden_frame == 1) { + if (cpi->common.refresh_golden_frame == 1) { #if 0 - if (0) - { + if (0) { FILE *f; f = fopen("GFexit.stt", "a"); @@ -948,61 +947,76 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { #endif - if 
(cpi->auto_adjust_gold_quantizer) { - calc_gf_params(cpi); - } - - /* If we are using alternate ref instead of gf then do not apply the - * boost It will instead be applied to the altref update Jims - * modified boost - */ - if (!cpi->source_alt_ref_active) { - if (cpi->oxcf.fixed_q < 0) { - if (cpi->pass == 2) { - /* The spend on the GF is defined in the two pass - * code for two pass encodes - */ - cpi->this_frame_target = cpi->per_frame_bandwidth; - } else { - int Boost = cpi->last_boost; - int frames_in_section = cpi->frames_till_gf_update_due + 1; - int allocation_chunks = (frames_in_section * 100) + (Boost - 100); - int bits_in_section = cpi->inter_frame_target * frames_in_section; - - /* Normalize Altboost and allocations chunck down to - * prevent overflow - */ - while (Boost > 1000) { - Boost /= 2; - allocation_chunks /= 2; - } - - /* Avoid loss of precision but avoid overflow */ - if ((bits_in_section >> 7) > allocation_chunks) { - cpi->this_frame_target = - Boost * (bits_in_section / allocation_chunks); - } else { - cpi->this_frame_target = - (Boost * bits_in_section) / allocation_chunks; - } - } - } else { - cpi->this_frame_target = - (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * - cpi->last_boost) / - 100; + if (cpi->auto_adjust_gold_quantizer) { + calc_gf_params(cpi); } - } - /* If there is an active ARF at this location use the minimum - * bits on this frame even if it is a contructed arf. - * The active maximum quantizer insures that an appropriate - * number of bits will be spent if needed for contstructed ARFs. - */ - else { - cpi->this_frame_target = 0; - } + /* If we are using alternate ref instead of gf then do not apply the + * boost It will instead be applied to the altref update Jims + * modified boost + */ + if (!cpi->source_alt_ref_active) { + if (cpi->oxcf.fixed_q < 0) { + if (cpi->pass == 2) { + /* The spend on the GF is defined in the two pass + * code for two pass encodes + */ + cpi->this_frame_target = cpi->per_frame_bandwidth; + } else { + int Boost = cpi->last_boost; + int frames_in_section = cpi->frames_till_gf_update_due + 1; + int allocation_chunks = (frames_in_section * 100) + (Boost - 100); + int bits_in_section = cpi->inter_frame_target * frames_in_section; + /* Normalize Altboost and allocations chunck down to + * prevent overflow + */ + while (Boost > 1000) { + Boost /= 2; + allocation_chunks /= 2; + } + + /* Avoid loss of precision but avoid overflow */ + if ((bits_in_section >> 7) > allocation_chunks) { + cpi->this_frame_target = + Boost * (bits_in_section / allocation_chunks); + } else { + cpi->this_frame_target = + (Boost * bits_in_section) / allocation_chunks; + } + } + } else { + cpi->this_frame_target = + (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * + cpi->last_boost) / + 100; + } + } else { + /* If there is an active ARF at this location use the minimum + * bits on this frame even if it is a contructed arf. + * The active maximum quantizer insures that an appropriate + * number of bits will be spent if needed for contstructed ARFs. + */ + cpi->this_frame_target = 0; + } + + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + } + } else { + // Special case for 1 pass CBR: fixed gf period. + // TODO(marpan): Adjust this boost/interval logic. + // If gf_cbr_boost_pct is small (below threshold) set the flag + // gf_noboost_onepass_cbr = 1, which forces the gf to use the same + // rate correction factor as last. 
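/* Worked example for the fixed-interval golden-frame update below (numbers
 * are illustrative): with gf_cbr_boost_pct = 150 and an inter-frame target of
 * 20000 bits, the golden frame is budgeted 20000 * (100 + 150) / 100 = 50000
 * bits.  The update is applied only when more than half of the macroblocks
 * had zero motion, and with gf_cbr_boost_pct <= 100 the gf_noboost_onepass_cbr
 * flag keeps the golden frame on the ordinary inter rate-correction factor. */
static int boosted_gf_target_sketch(int this_frame_target,
                                    unsigned int gf_cbr_boost_pct) {
  return (int)((this_frame_target * (100 + gf_cbr_boost_pct)) / 100);
}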
+ cpi->gf_noboost_onepass_cbr = (cpi->oxcf.gf_cbr_boost_pct <= 100); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + // Skip this update if the zero_mvcount is low. + if (cpi->zeromv_count > (cpi->common.MBs >> 1)) { + cpi->common.refresh_golden_frame = 1; + cpi->this_frame_target = + (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100; + } + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; cpi->current_gf_interval = cpi->frames_till_gf_update_due; } } @@ -1019,13 +1033,14 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { int projected_size_based_on_q = 0; /* Clear down mmx registers to allow floating point in what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->common.frame_type == KEY_FRAME) { rate_correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame || - cpi->common.refresh_golden_frame)) { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { rate_correction_factor = cpi->gf_rate_correction_factor; } else { rate_correction_factor = cpi->rate_correction_factor; @@ -1101,8 +1116,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) { cpi->key_frame_rate_correction_factor = rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame || - cpi->common.refresh_golden_frame)) { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { cpi->gf_rate_correction_factor = rate_correction_factor; } else { cpi->rate_correction_factor = rate_correction_factor; @@ -1117,7 +1133,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { cpi->active_worst_quality = cpi->worst_quality; return cpi->worst_quality; } - /* Reset Zbin OQ value */ cpi->mb.zbin_over_quant = 0; @@ -1127,10 +1142,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { Q = cpi->oxcf.key_q; } else if (cpi->oxcf.number_of_layers == 1 && - cpi->common.refresh_alt_ref_frame) { + cpi->common.refresh_alt_ref_frame && + !cpi->gf_noboost_onepass_cbr) { Q = cpi->oxcf.alt_q; } else if (cpi->oxcf.number_of_layers == 1 && - cpi->common.refresh_golden_frame) { + cpi->common.refresh_golden_frame && + !cpi->gf_noboost_onepass_cbr) { Q = cpi->oxcf.gold_q; } } else { @@ -1144,7 +1161,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)) { correction_factor = cpi->gf_rate_correction_factor; @@ -1198,6 +1215,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { zbin_oqmax = 0; } else if (cpi->oxcf.number_of_layers == 1 && + !cpi->gf_noboost_onepass_cbr && (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))) { @@ -1302,7 +1320,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi) { void vp8_adjust_key_frame_context(VP8_COMP *cpi) { /* Clear down mmx registers to allow floating point in 
what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); /* Do we have any key frame overspend to recover? */ /* Two-pass overspend handled elsewhere. */ @@ -1422,12 +1440,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) { // If this just encoded frame (mcomp/transform/quant, but before loopfilter and // pack_bitstream) has large overshoot, and was not being encoded close to the // max QP, then drop this frame and force next frame to be encoded at max QP. -// Condition this on 1 pass CBR with screen content mode and frame dropper off. +// Allow this for screen_content_mode = 2, or if drop frames is allowed. // TODO(marpan): Should do this exit condition during the encode_frame // (i.e., halfway during the encoding of the frame) to save cycles. int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && - cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) { + int force_drop_overshoot = 0; +#if CONFIG_MULTI_RES_ENCODING + // Only check for dropping due to overshoot on the lowest stream. + // If the lowest stream of the multi-res encoding was dropped due to + // overshoot, then force dropping on all upper layer streams + // (mr_encoder_id > 0). + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) { + force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp; + if (!force_drop_overshoot) { + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; + return 0; + } + } +#endif + if (cpi->common.frame_type != KEY_FRAME && + (cpi->oxcf.screen_content_mode == 2 || + (cpi->drop_frames_allowed && + (force_drop_overshoot || + (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) && + cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require // that projected_frame_size is somewhat greater than per-frame-bandwidth, @@ -1438,17 +1477,20 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // Rate threshold, in bytes. int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); // Threshold for the average (over all macroblocks) of the pixel-sum - // residual error over 16x16 block. Should add QP dependence on threshold? - int thresh_pred_err_mb = (256 << 4); + // residual error over 16x16 block. + int thresh_pred_err_mb = (200 << 4); int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); - if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) { + // Reduce/ignore thresh_rate if pred_err_mb much larger than its threshold, + // give more weight to pred_err metric for overshoot detection. + if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) + thresh_rate = thresh_rate >> 3; + if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb) || + force_drop_overshoot) { + unsigned int i; double new_correction_factor; - const int target_size = cpi->av_per_frame_bandwidth; int target_bits_per_mb; - // Drop this frame: advance frame counters, and set force_maxqp flag. - cpi->common.current_video_frame++; - cpi->frames_since_key++; + const int target_size = cpi->av_per_frame_bandwidth; // Flag to indicate we will force next frame to be encoded at max QP. 
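/* Rough numbers for the overshoot test above, assuming a 1 Mbps, 30 fps
 * stream (illustrative only): av_per_frame_bandwidth is about 33333 bits, so
 * thresh_rate = 2 * (33333 >> 3) = 8332 bytes -- roughly twice the nominal
 * per-frame byte budget -- and thresh_pred_err_mb = 200 << 4 = 3200, i.e. an
 * average residual of about 12.5 per pixel of a 16x16 block.  Condensed form
 * of the size/error part of the check, before the extra thresh_rate
 * relaxation added above (helper name is illustrative): */
static int overshoot_detected_sketch(int Q, int thresh_qp,
                                     int projected_frame_size,
                                     int av_per_frame_bandwidth,
                                     int pred_err_mb) {
  const int thresh_rate = 2 * (av_per_frame_bandwidth >> 3); /* bytes */
  const int thresh_pred_err_mb = 200 << 4;
  return Q < thresh_qp && projected_frame_size > thresh_rate &&
         pred_err_mb > thresh_pred_err_mb;
}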
cpi->force_maxqp = 1; // Reset the buffer levels. @@ -1479,14 +1521,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->rate_correction_factor > MAX_BPB_FACTOR) { cpi->rate_correction_factor = MAX_BPB_FACTOR; } + // Drop this frame: update frame counters. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + cpi->frames_since_last_drop_overshoot = 0; + if (cpi->oxcf.number_of_layers > 1) { + // Set max_qp and rate correction for all temporal layers if overshoot + // is detected. + for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->force_maxqp = 1; + lc->frames_since_last_drop_overshoot = 0; + lc->rate_correction_factor = cpi->rate_correction_factor; + } + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1; +#endif return 1; - } else { - cpi->force_maxqp = 0; - return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } diff --git a/libs/libvpx/vp8/encoder/rdopt.c b/libs/libvpx/vp8/encoder/rdopt.c index 9ba301e080..e210b44105 100644 --- a/libs/libvpx/vp8/encoder/rdopt.c +++ b/libs/libvpx/vp8/encoder/rdopt.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include @@ -15,12 +16,14 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "encodeframe.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" #include "pickinter.h" +#include "vp8/common/common.h" #include "vp8/common/entropymode.h" #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" @@ -30,6 +33,7 @@ #include "encodemb.h" #include "vp8/encoder/quantize.h" #include "vpx_dsp/variance.h" +#include "vpx_ports/system_state.h" #include "mcomp.h" #include "rdopt.h" #include "vpx_mem/vpx_mem.h" @@ -105,11 +109,10 @@ const int vp8_ref_frame_order[MAX_MODES] = { 0, }; -static void fill_token_costs(int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS], - const vp8_prob p[BLOCK_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES]) { +static void fill_token_costs( + int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS], + const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]) { int i, j, k; for (i = 0; i < BLOCK_TYPES; ++i) { @@ -163,7 +166,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue) { double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0; double rdconst = 2.80; - vp8_clear_system_state(); + vpx_clear_system_state(); /* Further tests required to see if optimum is different * for key frames, golden frames and arf frames. 
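The hunk above reformats vp8_initialize_rd_consts(), which seeds the
rate-distortion multipliers from the (capped) quantizer.  For orientation,
mode decisions in this encoder minimise a Lagrangian cost of the form
distortion + lambda * rate; the RDCOST macro in vp8/encoder/rdopt.h (also
touched later in this patch) evaluates it with the rate term weighted by a
Q-dependent multiplier and scaled back by 1/256.  A minimal sketch of that
evaluation (parameter names are illustrative):

/* cost = distortion * DM + (rate * RM + 128) >> 8, mirroring RDCOST(). */
static int rd_cost_sketch(int rate, int distortion, int rate_mult,
                          int dist_mult) {
  return ((128 + rate * rate_mult) >> 8) + dist_mult * distortion;
}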
@@ -607,9 +610,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, for (i = 0; i < 16; ++i) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), - UNINITIALIZED_IS_SAFE(d); + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, ry = 0, d = 0; if (mb->e_mbd.frame_type == KEY_FRAME) { const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); @@ -626,6 +628,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, distortion += d; tot_rate_y += ry; + assert(best_mode != B_MODE_COUNT); mic->bmi[i].as_mode = best_mode; if (total_rd >= (int64_t)best_rd) break; @@ -643,7 +646,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; int rate, ratey; int distortion; int best_rd = INT_MAX; @@ -673,6 +676,7 @@ static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, } } + assert(mode_selected != MB_MODE_COUNT); xd->mode_info_context->mbmi.mode = mode_selected; return best_rd; } @@ -740,9 +744,9 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; int best_rd = INT_MAX; - int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r); + int d = 0, r = 0; int rate_to; MACROBLOCKD *xd = &x->e_mbd; @@ -786,6 +790,7 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, *rate = r; *distortion = d; + assert(mode_selected != MB_MODE_COUNT); xd->mode_info_context->mbmi.uv_mode = mode_selected; } @@ -849,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label, default: break; } - if (m == ABOVE4X4) /* replace above with left if same */ - { + if (m == ABOVE4X4) { /* replace above with left if same */ int_mv left_mv; left_mv.as_int = col ? 
d[-1].bmi.mv.as_int : left_block_mv(mic, i); @@ -956,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; - ENTROPY_CONTEXT *ta_b; - ENTROPY_CONTEXT *tl_b; memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - ta_b = (ENTROPY_CONTEXT *)&t_above_b; - tl_b = (ENTROPY_CONTEXT *)&t_left_b; + vp8_zero(t_above_b); + vp8_zero(t_left_b); br = 0; bd = 0; @@ -1148,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, mode_selected = this_mode; best_label_rd = this_rd; - memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES)); } } /*for each 4x4 mode*/ - memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES)); labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], bsi->ref_mv, x->mvcost); diff --git a/libs/libvpx/vp8/encoder/rdopt.h b/libs/libvpx/vp8/encoder/rdopt.h index 8186ff1051..960bd8f1cd 100644 --- a/libs/libvpx/vp8/encoder/rdopt.h +++ b/libs/libvpx/vp8/encoder/rdopt.h @@ -19,6 +19,9 @@ extern "C" { #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D)) +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +void vp8_auto_select_speed(VP8_COMP *cpi); + static INLINE void insertsortmv(int arr[], int len) { int i, j, k; diff --git a/libs/libvpx/vp8/encoder/temporal_filter.c b/libs/libvpx/vp8/encoder/temporal_filter.c index 1b2f46bb69..0a7d25fb0a 100644 --- a/libs/libvpx/vp8/encoder/temporal_filter.c +++ b/libs/libvpx/vp8/encoder/temporal_filter.c @@ -20,6 +20,7 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" +#include "temporal_filter.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" diff --git a/libs/libvpx/vp8/encoder/temporal_filter.h b/libs/libvpx/vp8/encoder/temporal_filter.h new file mode 100644 index 0000000000..865d909fb6 --- /dev/null +++ b/libs/libvpx/vp8/encoder/temporal_filter.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VP8_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/libs/libvpx/vp8/encoder/x86/dct_mmx.asm b/libs/libvpx/vp8/encoder/x86/dct_mmx.asm deleted file mode 100644 index 6f188cb94a..0000000000 --- a/libs/libvpx/vp8/encoder/x86/dct_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) -global sym(vp8_short_fdct4x4_mmx) PRIVATE -sym(vp8_short_fdct4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ; input - mov rdi, arg(1) ; output - - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] - ; read the input data - movq mm0, [rsi] - movq mm1, [rsi + rax] - - movq mm2, [rcx] - movq mm4, [rcx + rax] - - ; transpose for the first stage - movq mm3, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 20 21 22 23 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm3, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm4 ; 20 30 21 31 - punpckhwd mm5, mm4 ; 22 32 23 33 - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm3 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm3, mm5 ; 03 13 23 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 3 - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - - paddw mm0, mm3 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm4, mm2 ; c1 = 1 - 2 - psubw mm5, mm3 ; d1 = 0 - 3 - - psllw mm5, 3 - psllw mm4, 3 - - psllw mm0, 3 - psllw mm1, 3 - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; op[0] = a1 + b1 - psubw mm2, mm1 ; op[2] = a1 - b1 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm4 ; c1 d1 - punpckhwd mm5, mm4 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_14500)] - paddd mm4, MMWORD PTR[GLOBAL(_14500)] - paddd mm3, MMWORD PTR[GLOBAL(_7500)] - paddd mm5, MMWORD PTR[GLOBAL(_7500)] - - psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - - packssdw mm1, mm4 ; op[1] - packssdw mm3, mm5 ; op[3] - - ; done with vertical - ; transpose for the second stage - movq mm4, mm0 ; 00 10 20 30 - movq mm5, mm2 ; 02 12 22 32 - - punpcklwd mm0, mm1 ; 00 01 10 11 - punpckhwd mm4, mm1 ; 20 21 30 31 - - punpcklwd mm2, mm3 ; 02 03 12 13 - punpckhwd mm5, mm3 ; 22 23 32 33 - - movq mm1, mm0 ; 00 01 10 11 - punpckldq mm0, mm2 ; 00 01 02 03 - - punpckhdq mm1, mm2 ; 01 22 12 13 - - movq mm2, mm4 ; 20 
31 30 31 - punpckldq mm2, mm5 ; 20 21 22 23 - - punpckhdq mm4, mm5 ; 30 31 32 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 4 - - movq mm5, mm0 - movq mm3, mm1 - - paddw mm0, mm4 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm3, mm2 ; c1 = 1 - 2 - psubw mm5, mm4 ; d1 = 0 - 3 - - pxor mm6, mm6 ; zero out for compare - - pcmpeqw mm6, mm5 ; d1 != 0 - - pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, - ; and keep bit 0 of lower - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; a1 + b1 - psubw mm2, mm1 ; a1 - b1 - - paddw mm0, MMWORD PTR[GLOBAL(_7w)] - paddw mm2, MMWORD PTR[GLOBAL(_7w)] - - psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4 - psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - - movq MMWORD PTR[rdi + 0 ], mm0 - movq MMWORD PTR[rdi + 16], mm2 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm3 ; c1 d1 - punpckhwd mm5, mm3 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_12000)] - paddd mm4, MMWORD PTR[GLOBAL(_12000)] - paddd mm3, MMWORD PTR[GLOBAL(_51000)] - paddd mm5, MMWORD PTR[GLOBAL(_51000)] - - psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - - packssdw mm1, mm4 ; op[4] - packssdw mm3, mm5 ; op[12] - - paddw mm1, mm6 ; op[4] += (d1!=0) - - movq MMWORD PTR[rdi + 8 ], mm1 - movq MMWORD PTR[rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 8 -_5352_2217: - dw 5352 - dw 2217 - dw 5352 - dw 2217 -align 8 -_2217_neg5352: - dw 2217 - dw -5352 - dw 2217 - dw -5352 -align 8 -_cmp_mask: - times 4 dw 1 -align 8 -_7w: - times 4 dw 7 -align 8 -_14500: - times 2 dd 14500 -align 8 -_7500: - times 2 dd 7500 -align 8 -_12000: - times 2 dd 12000 -align 8 -_51000: - times 2 dd 51000 diff --git a/libs/libvpx/vp8/encoder/x86/dct_sse2.asm b/libs/libvpx/vp8/encoder/x86/dct_sse2.asm index d06bca5927..4d92f0341d 100644 --- a/libs/libvpx/vp8/encoder/x86/dct_sse2.asm +++ b/libs/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -60,6 +60,8 @@ ret %endmacro +SECTION .text + ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_fdct4x4_sse2) PRIVATE sym(vp8_short_fdct4x4_sse2): diff --git a/libs/libvpx/vp8/encoder/x86/encodeopt.asm b/libs/libvpx/vp8/encoder/x86/encodeopt.asm index fe26b18e56..f6c6aeae74 100644 --- a/libs/libvpx/vp8/encoder/x86/encodeopt.asm +++ b/libs/libvpx/vp8/encoder/x86/encodeopt.asm @@ -11,9 +11,11 @@ %include "vpx_ports/x86_abi_support.asm" -;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) -global sym(vp8_block_error_xmm) PRIVATE -sym(vp8_block_error_xmm): +SECTION .text + +;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_sse2) PRIVATE +sym(vp8_block_error_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 @@ -59,152 +61,9 @@ sym(vp8_block_error_xmm): pop rbp ret -;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) -global sym(vp8_block_error_mmx) PRIVATE -sym(vp8_block_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov 
rdi, arg(1) ;dcoef_ptr - movq mm3, [rsi] - - movq mm4, [rdi] - movq mm5, [rsi+8] - - movq mm6, [rdi+8] - pxor mm1, mm1 ; from movd mm1, dc ; dc =0 - - movq mm2, mm7 - psubw mm5, mm6 - - por mm1, mm2 - pmaddwd mm5, mm5 - - pcmpeqw mm1, mm7 - psubw mm3, mm4 - - pand mm1, mm3 - pmaddwd mm1, mm1 - - paddd mm1, mm5 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm3, mm5 - - paddd mm1, mm3 - movq mm0, mm1 - - psrlq mm1, 32 - paddd mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -global sym(vp8_mbblock_error_mmx_impl) PRIVATE -sym(vp8_mbblock_error_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov rdi, arg(1) ;dcoef_ptr - pxor mm2, mm2 - - movd mm1, dword ptr arg(2) ;dc - por mm1, mm2 - - pcmpeqw mm1, mm7 - mov rcx, 16 - -.mberror_loop_mmx: - movq mm3, [rsi] - movq mm4, [rdi] - - movq mm5, [rsi+8] - movq mm6, [rdi+8] - - - psubw mm5, mm6 - pmaddwd mm5, mm5 - - psubw mm3, mm4 - pand mm3, mm1 - - pmaddwd mm3, mm3 - paddd mm2, mm5 - - paddd mm2, mm3 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm2, mm5 - - paddd mm2, mm3 - add rsi, 32 - - add rdi, 32 - sub rcx, 1 - - jnz .mberror_loop_mmx - - movq mm0, mm2 - psrlq mm2, 32 - - paddd mm0, mm2 - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -global sym(vp8_mbblock_error_xmm_impl) PRIVATE -sym(vp8_mbblock_error_xmm_impl): +;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_sse2_impl) PRIVATE +sym(vp8_mbblock_error_sse2_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 @@ -272,66 +131,9 @@ sym(vp8_mbblock_error_xmm_impl): ret -;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); -global sym(vp8_mbuverror_mmx_impl) PRIVATE -sym(vp8_mbuverror_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;s_ptr - mov rdi, arg(1) ;d_ptr - - mov rcx, 16 - pxor mm7, mm7 - -.mbuverror_loop_mmx: - - movq mm1, [rsi] - movq mm2, [rdi] - - psubw mm1, mm2 - pmaddwd mm1, mm1 - - - movq mm3, [rsi+8] - movq mm4, [rdi+8] - - psubw mm3, mm4 - pmaddwd mm3, mm3 - - - paddd mm7, mm1 - paddd mm7, mm3 - - - add rsi, 16 - add rdi, 16 - - dec rcx - jnz .mbuverror_loop_mmx - - movq mm0, mm7 - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); -global sym(vp8_mbuverror_xmm_impl) PRIVATE -sym(vp8_mbuverror_xmm_impl): +;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_sse2_impl) PRIVATE +sym(vp8_mbuverror_sse2_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 diff --git a/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm index f4989279f4..b5d5de4a59 100644 --- a/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm +++ b/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void 
vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_walsh4x4_sse2) PRIVATE sym(vp8_short_walsh4x4_sse2): diff --git a/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm deleted file mode 100644 index 2864ce16d9..0000000000 --- a/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; short *scan_mask, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr); -global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE -sym(vp8_fast_quantize_b_impl_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - movq mm0, [rsi] - - mov rax, arg(1) ;zbin_ptr - movq mm1, [rax] - - movq mm3, mm0 - psraw mm0, 15 - - pxor mm3, mm0 - psubw mm3, mm0 ; abs - - movq mm2, mm3 - pcmpgtw mm1, mm2 - - pandn mm1, mm2 - movq mm3, mm1 - - mov rdx, arg(6) ;quant_ptr - movq mm1, [rdx] - - mov rcx, arg(5) ;round_ptr - movq mm2, [rcx] - - paddw mm3, mm2 - pmulhuw mm3, mm1 - - pxor mm3, mm0 - psubw mm3, mm0 ;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - movq mm0, mm3 - - movq [rdi], mm3 - - mov rax, arg(3) ;dequant_ptr - movq mm2, [rax] - - pmullw mm3, mm2 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax], mm3 - - ; next 8 - movq mm4, [rsi+8] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+8] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+8] - movq mm6, [rcx+8] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+8], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+8] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+8], mm7 - - - ; next 8 - movq mm4, [rsi+16] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+16] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+16] - movq mm6, [rcx+16] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+16], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+16] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+16], mm7 - - - ; next 8 - movq mm4, [rsi+24] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+24] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+24] - movq mm6, [rcx+24] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+24], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+24] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+24], mm7 - - - - mov rdi, arg(4) 
;scan_mask - mov rsi, arg(2) ;qcoeff_ptr - - pxor mm5, mm5 - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rsi+8] - - movq mm2, [rdi] - movq mm3, [rdi+8]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - movq mm5, mm0 - - paddd mm5, mm1 - - movq mm0, [rsi+16] - movq mm1, [rsi+24] - - movq mm2, [rdi+16] - movq mm3, [rdi+24]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - paddd mm5, mm0 - - paddd mm5, mm1 - movq mm0, mm5 - - psrlq mm5, 32 - paddd mm0, mm5 - - ; eob adjustment begins here - movq rcx, mm0 - and rcx, 0xffff - - xor rdx, rdx - sub rdx, rcx ; rdx=-rcx - - bsr rax, rcx - inc rax - - sar rdx, 31 - and rax, rdx - ; Substitute the sse assembly for the old mmx mixed assembly/C. The - ; following is kept as reference - ; movq rcx, mm0 - ; bsr rax, rcx - ; - ; mov eob, rax - ; mov eee, rcx - ; - ;if(eee==0) - ;{ - ; eob=-1; - ;} - ;else if(eee<0) - ;{ - ; eob=15; - ;} - ;d->eob = eob+1; - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm index bd92b398a0..d2b4711b87 100644 --- a/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ; void vp8_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 @@ -203,5 +205,5 @@ align 16 _const_top_bit: times 8 dw 1<<15 align 16 -_const_16w +_const_16w: times 8 dw 16 diff --git a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c deleted file mode 100644 index d00abd4d08..0000000000 --- a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vpx_ports/x86.h" -#include "vp8/encoder/block.h" - -void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); -void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp8_short_fdct4x4_mmx(input, output, pitch); - vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); -} - -int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *scan_mask, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { - const short *scan_mask = vp8_default_zig_zag_mask; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - *d->eob = (char)vp8_fast_quantize_b_impl_mmx( - coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask, - - round_ptr, quant_ptr, dqcoeff_ptr); -} - -int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) { - short *coeff_ptr = mb->block[0].coeff; - short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); -} - -int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); -int vp8_mbuverror_mmx(MACROBLOCK *mb) { - short *s_ptr = &mb->coeff[256]; - short *d_ptr = &mb->e_mbd.dqcoeff[256]; - return vp8_mbuverror_mmx_impl(s_ptr, d_ptr); -} diff --git a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c index 29eb1f1b45..d0752453ee 100644 --- a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c +++ b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c @@ -13,16 +13,16 @@ #include "vpx_ports/x86.h" #include "vp8/encoder/block.h" -int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) { +int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_sse2(MACROBLOCK *mb, int dc) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); + return vp8_mbblock_error_sse2_impl(coeff_ptr, dcoef_ptr, dc); } -int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); -int vp8_mbuverror_xmm(MACROBLOCK *mb) { +int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_sse2(MACROBLOCK *mb) { short *s_ptr = &mb->coeff[256]; short *d_ptr = &mb->e_mbd.dqcoeff[256]; - return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); + return vp8_mbuverror_sse2_impl(s_ptr, d_ptr); } diff --git a/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c similarity index 99% rename from libs/libvpx/vp8/encoder/x86/quantize_ssse3.c rename to libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index 322f0a151f..d547450154 100644 --- a/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c +++ b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -10,6 +10,7 @@ #include /* SSSE3 */ +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" /* bitscan reverse (bsr) */ diff --git a/libs/libvpx/vp8/vp8_common.mk b/libs/libvpx/vp8/vp8_common.mk index 886b127d6a..246fe6a677 100644 --- a/libs/libvpx/vp8/vp8_common.mk +++ b/libs/libvpx/vp8/vp8_common.mk @@ -80,8 +80,6 @@ VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c VP8_COMMON_SRCS-$(HAVE_MMX) += 
common/x86/dequantize_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_loopfilter_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm @@ -118,6 +116,14 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c + ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif diff --git a/libs/libvpx/vp8/vp8_cx_iface.c b/libs/libvpx/vp8/vp8_cx_iface.c index fac237eec0..af6689fd97 100644 --- a/libs/libvpx/vp8/vp8_cx_iface.c +++ b/libs/libvpx/vp8/vp8_cx_iface.c @@ -40,6 +40,7 @@ struct vp8_extracfg { vp8e_tuning tuning; unsigned int cq_level; /* constrained quality level */ unsigned int rc_max_intra_bitrate_pct; + unsigned int gf_cbr_boost_pct; unsigned int screen_content_mode; }; @@ -65,6 +66,7 @@ static struct vp8_extracfg default_extracfg = { 0, /* tuning*/ 10, /* cq_level */ 0, /* rc_max_intra_bitrate_pct */ + 0, /* gf_cbr_boost_pct */ 0, /* screen_content_mode */ }; @@ -315,6 +317,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; + oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; oxcf->worst_allowed_q = cfg.rc_max_quantizer; @@ -558,6 +561,13 @@ static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx, return update_extracfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + return update_extracfg(ctx, &extra_cfg); +} + static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp8_extracfg extra_cfg = ctx->vp8_cfg; @@ -1159,6 +1169,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { { VP8E_SET_CQ_LEVEL, set_cq_level }, { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, + { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, { -1, NULL }, }; @@ -1205,6 +1216,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* rc_two_pass_vbrmin_section */ 400, /* rc_two_pass_vbrmax_section */ + 0, // rc_2pass_vbr_corpus_complexity (only has meaningfull for VP9) /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ diff --git a/libs/libvpx/vp8/vp8_dx_iface.c b/libs/libvpx/vp8/vp8_dx_iface.c index b1f8340d67..f20283c1e1 100644 --- a/libs/libvpx/vp8/vp8_dx_iface.c +++ b/libs/libvpx/vp8/vp8_dx_iface.c @@ -24,6 +24,7 @@ #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" 
#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" #if CONFIG_ERROR_CONCEALMENT #include "decoder/error_concealment.h" #endif @@ -79,7 +80,6 @@ static int vp8_init_ctx(vpx_codec_ctx_t *ctx) { static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; - vpx_codec_alg_priv_t *priv = NULL; (void)data; vp8_rtcd(); @@ -91,7 +91,10 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, * information becomes known. */ if (!ctx->priv) { + vpx_codec_alg_priv_t *priv; + if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR; + priv = (vpx_codec_alg_priv_t *)ctx->priv; /* initialize number of fragments to zero */ @@ -101,22 +104,6 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); /*post processing level initialized to do nothing */ - } else { - priv = (vpx_codec_alg_priv_t *)ctx->priv; - } - - priv->yv12_frame_buffers.use_frame_threads = - (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING); - - /* for now, disable frame threading */ - priv->yv12_frame_buffers.use_frame_threads = 0; - - if (priv->yv12_frame_buffers.use_frame_threads && - ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) || - (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) { - /* row-based threading, error concealment, and input fragments will - * not be supported when using frame-based threading */ - res = VPX_CODEC_INVALID_PARAM; } return res; @@ -157,8 +144,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, } si->is_kf = 0; - if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */ - { + if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */ si->is_kf = 1; /* vet via sync code */ @@ -241,7 +227,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, - unsigned int data_sz, vpx_codec_err_t *res) { + unsigned int data_sz, + volatile vpx_codec_err_t *res) { *res = VPX_CODEC_OK; if (ctx->fragments.count == 0) { @@ -280,7 +267,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res; unsigned int resolution_change = 0; unsigned int w, h; @@ -365,7 +352,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, * reallocation is attempted on resync. 
*/ ctx->si.w = 0; ctx->si.h = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); /* same return value as used in vp8dx_receive_compressed_data */ return -1; } @@ -427,7 +414,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #endif #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); } #else @@ -518,7 +505,7 @@ static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - if (data && !ctx->yv12_frame_buffers.use_frame_threads) { + if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; @@ -535,7 +522,7 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - if (data && !ctx->yv12_frame_buffers.use_frame_threads) { + if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; @@ -548,6 +535,14 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, } } +static vpx_codec_err_t vp8_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp8dx_get_quantizer(ctx->yv12_frame_buffers.pbi[0]); + return VPX_CODEC_OK; +} + static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, va_list args) { #if CONFIG_POSTPROC @@ -572,7 +567,7 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *update_info = va_arg(args, int *); - if (update_info && !ctx->yv12_frame_buffers.use_frame_threads) { + if (update_info) { VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; *update_info = pbi->common.refresh_alt_ref_frame * (int)VP8_ALTR_FRAME + @@ -585,12 +580,11 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } } -extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame); static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, va_list args) { int *ref_info = va_arg(args, int *); - if (ref_info && !ctx->yv12_frame_buffers.use_frame_threads) { + if (ref_info) { VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *oci = &pbi->common; *ref_info = @@ -640,6 +634,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted }, { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame }, + { VPXD_GET_LAST_QUANTIZER, vp8_get_quantizer }, { VPXD_SET_DECRYPTOR, vp8_set_decryptor }, { -1, NULL }, }; diff --git a/libs/libvpx/vp8/vp8cx.mk b/libs/libvpx/vp8/vp8cx.mk index 4e3d31e3b7..0dac0169d5 100644 --- a/libs/libvpx/vp8/vp8cx.mk +++ b/libs/libvpx/vp8/vp8cx.mk @@ -30,6 +30,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c VP8_CX_SRCS-yes += encoder/encodemb.c VP8_CX_SRCS-yes += encoder/encodemv.c VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h VP8_CX_SRCS-yes += encoder/firstpass.c VP8_CX_SRCS-yes += encoder/block.h VP8_CX_SRCS-yes += encoder/boolhuff.h @@ -56,11 +57,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c VP8_CX_SRCS-yes += encoder/onyx_if.c VP8_CX_SRCS-yes += encoder/pickinter.c VP8_CX_SRCS-yes += encoder/picklpf.c +VP8_CX_SRCS-yes += encoder/picklpf.h VP8_CX_SRCS-yes += encoder/vp8_quantize.c VP8_CX_SRCS-yes += encoder/ratectrl.c VP8_CX_SRCS-yes += 
encoder/rdopt.c VP8_CX_SRCS-yes += encoder/segmentation.c VP8_CX_SRCS-yes += encoder/segmentation.h +VP8_CX_SRCS-yes += common/vp8_skin_detection.c +VP8_CX_SRCS-yes += common/vp8_skin_detection.h VP8_CX_SRCS-yes += encoder/tokenize.c VP8_CX_SRCS-yes += encoder/dct_value_cost.h VP8_CX_SRCS-yes += encoder/dct_value_tokens.h @@ -68,20 +72,20 @@ VP8_CX_SRCS-yes += encoder/treewriter.c VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c +VP8_CX_SRCS-yes += encoder/temporal_filter.h VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h endif -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c -VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) @@ -90,7 +94,6 @@ endif VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) @@ -107,6 +110,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c + ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c endif diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index eca655e728..025254c3f3 100644 --- a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,14 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" - -static int16_t sinpi_1_9 = 0x14a3; -static int16_t sinpi_2_9 = 0x26c9; -static int16_t sinpi_3_9 = 0x3441; -static int16_t sinpi_4_9 = 0x3b6c; -static int16_t cospi_8_64 = 0x3b21; -static int16_t cospi_16_64 = 0x2d41; -static int16_t cospi_24_64 = 0x187e; +#include "vpx_dsp/txfm_common.h" static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { int32x4_t q8s32, q9s32; @@ -37,7 +30,6 @@ static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); - return; } static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, @@ -45,7 +37,6 @@ static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, *d0s16 = vdup_n_s16(cospi_8_64); *d1s16 = vdup_n_s16(cospi_16_64); *d2s16 = vdup_n_s16(cospi_24_64); - return; } static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, @@ -54,7 +45,6 @@ static INLINE void 
GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, *d4s16 = vdup_n_s16(sinpi_2_9); *q3s16 = vdupq_n_s16(sinpi_3_9); *d5s16 = vdup_n_s16(sinpi_4_9); - return; } static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, @@ -80,17 +70,16 @@ static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q10s32, 14); + d26s16 = vrshrn_n_s32(q13s32, 14); + d27s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d28s16 = vrshrn_n_s32(q10s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); *q8s16 = vaddq_s16(q13s16, q14s16); *q9s16 = vsubq_s16(q13s16, q14s16); *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp - return; } static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, @@ -129,18 +118,17 @@ static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, q14s32 = vaddq_s32(q11s32, q12s32); q10s32 = vsubq_s32(q10s32, q12s32); - d16s16 = vqrshrn_n_s32(q13s32, 14); - d17s16 = vqrshrn_n_s32(q14s32, 14); - d18s16 = vqrshrn_n_s32(q15s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); + d16s16 = vrshrn_n_s32(q13s32, 14); + d17s16 = vrshrn_n_s32(q14s32, 14); + d18s16 = vrshrn_n_s32(q15s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); *q8s16 = vcombine_s16(d16s16, d17s16); *q9s16 = vcombine_s16(d18s16, d19s16); - return; } -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { uint8x8_t d26u8, d27u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; uint32x2_t d26u32, d27u32; @@ -156,9 +144,8 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + vp9_iht4x4_16_add_c(input, dest, stride, tx_type); return; - break; case 1: // iadst_idct // generate constants GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); @@ -209,11 +196,11 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, q9s16 = vrshrq_n_s16(q9s16, 4); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += dest_stride; + dest += stride; d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); @@ -223,11 +210,10 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); - return; } diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 0e3febf159..1c739861c3 100644 --- a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -76,10 +76,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d8s16 = vrshrn_n_s32(q2s32, 14); + d9s16 = vrshrn_n_s32(q3s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = vrshrn_n_s32(q6s32, 14); q4s16 = vcombine_s16(d8s16, d9s16); q5s16 = vcombine_s16(d10s16, d11s16); @@ -93,10 +93,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q9s32 = vmlal_s16(q9s32, d22s16, d2s16); q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); + d14s16 = vrshrn_n_s32(q2s32, 14); + d15s16 = vrshrn_n_s32(q3s32, 14); + d12s16 = vrshrn_n_s32(q9s32, 14); + d13s16 = vrshrn_n_s32(q13s32, 14); q6s16 = vcombine_s16(d12s16, d13s16); q7s16 = vcombine_s16(d14s16, d15s16); @@ -115,10 +115,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d0s16 = vdup_n_s16(cospi_24_64); d1s16 = vdup_n_s16(cospi_8_64); - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); + d18s16 = vrshrn_n_s32(q2s32, 14); + d19s16 = vrshrn_n_s32(q3s32, 14); + d22s16 = vrshrn_n_s32(q13s32, 14); + d23s16 = vrshrn_n_s32(q15s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); *q11s16 = vcombine_s16(d22s16, d23s16); @@ -132,10 +132,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q8s32 = vmlal_s16(q8s32, d28s16, d0s16); q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); + d26s16 = 
vrshrn_n_s32(q2s32, 14); + d27s16 = vrshrn_n_s32(q3s32, 14); + d30s16 = vrshrn_n_s32(q8s32, 14); + d31s16 = vrshrn_n_s32(q12s32, 14); *q13s16 = vcombine_s16(d26s16, d27s16); *q15s16 = vcombine_s16(d30s16, d31s16); @@ -165,10 +165,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vrshrn_n_s32(q9s32, 14); + d11s16 = vrshrn_n_s32(q10s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -180,7 +180,6 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, *q13s16 = vsubq_s16(q2s16, q5s16); *q14s16 = vsubq_s16(q1s16, q6s16); *q15s16 = vsubq_s16(q0s16, q7s16); - return; } static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, @@ -243,8 +242,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q1s32 = vsubq_s32(q1s32, q5s32); q2s32 = vsubq_s32(q2s32, q6s32); - d22s16 = vqrshrn_n_s32(q11s32, 14); - d23s16 = vqrshrn_n_s32(q12s32, 14); + d22s16 = vrshrn_n_s32(q11s32, 14); + d23s16 = vrshrn_n_s32(q12s32, 14); *q11s16 = vcombine_s16(d22s16, d23s16); q12s32 = vaddq_s32(q3s32, q7s32); @@ -252,12 +251,12 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q3s32 = vsubq_s32(q3s32, q7s32); q4s32 = vsubq_s32(q4s32, q8s32); - d2s16 = vqrshrn_n_s32(q1s32, 14); - d3s16 = vqrshrn_n_s32(q2s32, 14); - d24s16 = vqrshrn_n_s32(q12s32, 14); - d25s16 = vqrshrn_n_s32(q15s32, 14); - d6s16 = vqrshrn_n_s32(q3s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); + d2s16 = vrshrn_n_s32(q1s32, 14); + d3s16 = vrshrn_n_s32(q2s32, 14); + d24s16 = vrshrn_n_s32(q12s32, 14); + d25s16 = vrshrn_n_s32(q15s32, 14); + d6s16 = vrshrn_n_s32(q3s32, 14); + d7s16 = vrshrn_n_s32(q4s32, 14); *q12s16 = vcombine_s16(d24s16, d25s16); d0s16 = vdup_n_s16(cospi_10_64); @@ -292,10 +291,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q2s32 = vsubq_s32(q2s32, q10s32); q6s32 = vsubq_s32(q6s32, q9s32); - d28s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); + d28s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q6s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); q9s32 = vaddq_s32(q4s32, q0s32); @@ -306,10 +305,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d30s16 = vdup_n_s16(cospi_8_64); d31s16 = vdup_n_s16(cospi_24_64); - d18s16 = vqrshrn_n_s32(q9s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); - d8s16 = vqrshrn_n_s32(q4s32, 14); - d9s16 = vqrshrn_n_s32(q5s32, 14); + d18s16 = vrshrn_n_s32(q9s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); + d8s16 = vrshrn_n_s32(q4s32, 14); + d9s16 = vrshrn_n_s32(q5s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q5s32 = vmull_s16(d2s16, d30s16); @@ -342,10 +341,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vsubq_s32(q5s32, q1s32); q6s32 = vsubq_s32(q6s32, q3s32); - d18s16 = vqrshrn_n_s32(q14s32, 14); - d19s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d18s16 = vrshrn_n_s32(q14s32, 14); + d19s16 = vrshrn_n_s32(q15s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = 
vrshrn_n_s32(q6s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q1s32 = vaddq_s32(q7s32, q10s32); @@ -353,10 +352,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q7s32 = vsubq_s32(q7s32, q10s32); q0s32 = vsubq_s32(q0s32, q2s32); - d28s16 = vqrshrn_n_s32(q1s32, 14); - d29s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q7s32, 14); - d15s16 = vqrshrn_n_s32(q0s32, 14); + d28s16 = vrshrn_n_s32(q1s32, 14); + d29s16 = vrshrn_n_s32(q3s32, 14); + d14s16 = vrshrn_n_s32(q7s32, 14); + d15s16 = vrshrn_n_s32(q0s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); d30s16 = vdup_n_s16(cospi_16_64); @@ -375,10 +374,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q3s32, 14); - d24s16 = vqrshrn_n_s32(q13s32, 14); - d25s16 = vqrshrn_n_s32(q1s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q3s32, 14); + d24s16 = vrshrn_n_s32(q13s32, 14); + d25s16 = vrshrn_n_s32(q1s32, 14); q2s16 = vcombine_s16(d4s16, d5s16); *q12s16 = vcombine_s16(d24s16, d25s16); @@ -392,10 +391,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - d20s16 = vqrshrn_n_s32(q13s32, 14); - d21s16 = vqrshrn_n_s32(q1s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q0s32, 14); + d20s16 = vrshrn_n_s32(q13s32, 14); + d21s16 = vrshrn_n_s32(q1s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q0s32, 14); *q10s16 = vcombine_s16(d20s16, d21s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -405,11 +404,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, *q11s16 = vsubq_s16(q5s16, q2s16); *q13s16 = vsubq_s16(q5s16, q6s16); *q15s16 = vsubq_s16(q5s16, q4s16); - return; } -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { int i; uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; @@ -431,9 +429,8 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type); + vp9_iht8x8_64_add_c(input, dest, stride, tx_type); return; - break; case 1: // iadst_idct // generate IDCT constants // GENERATE_IDCT_CONSTANTS @@ -511,13 +508,13 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, } d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); @@ -532,13 +529,12 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + d2 += stride; } - return; } diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 2d4839174d..f6b29265e6 100644 --- a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -21,8 +21,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; @@ -37,7 +37,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical vpx_idct4_rows_dspr2(input, outptr); - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal vpx_idct4_rows_dspr2(input, outptr); @@ -48,8 +48,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); outptr += 4; } @@ -66,7 +66,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { @@ -80,8 +80,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); } break; default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break; diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 
86896f04ca..b945e307e6 100644 --- a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -20,8 +20,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; @@ -34,7 +34,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical idct8_rows_dspr2(input, outptr, 8); - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal idct8_rows_dspr2(input, outptr, 8); @@ -43,8 +43,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; case DCT_ADST: // DCT in vertical, ADST in horizontal @@ -59,7 +59,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { @@ -74,8 +74,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break; diff --git a/libs/libvpx/vp9/common/vp9_alloccommon.c b/libs/libvpx/vp9/common/vp9_alloccommon.c index 66aa733b9e..7345e259b6 100644 --- a/libs/libvpx/vp9/common/vp9_alloccommon.c +++ b/libs/libvpx/vp9/common/vp9_alloccommon.c @@ -17,24 +17,6 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. 
-void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; return 0; } @@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) { } cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } + cm->last_frame_seg_map = NULL; } void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; @@ -176,6 +155,9 @@ fail: } void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif vp9_free_context_buffers(cm); vpx_free(cm->fc); @@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_init_context_buffers(VP9_COMMON *cm) { cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libs/libvpx/vp9/common/vp9_blockd.c b/libs/libvpx/vp9/common/vp9_blockd.c index b0249687fd..4327599510 100644 --- a/libs/libvpx/vp9/common/vp9_blockd.c +++ b/libs/libvpx/vp9/common/vp9_blockd.c @@ -53,8 +53,9 @@ void vp9_foreach_transformed_block_in_plane( // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. const int max_blocks_wide = - num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> - (5 + pd->subsampling_x)); + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 
0 diff --git a/libs/libvpx/vp9/common/vp9_debugmodes.c b/libs/libvpx/vp9/common/vp9_debugmodes.c index 7d128c9f7f..28cd4a1924 100644 --- a/libs/libvpx/vp9/common/vp9_debugmodes.c +++ b/libs/libvpx/vp9/common/vp9_debugmodes.c @@ -34,7 +34,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(file, "%2d ", *((int *)((char *)(mi[0]) + member_offset))); + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); diff --git a/libs/libvpx/vp9/common/vp9_entropymode.c b/libs/libvpx/vp9/common/vp9_entropymode.c index 22365efc50..47cd63e94f 100644 --- a/libs/libvpx/vp9/common/vp9_entropymode.c +++ b/libs/libvpx/vp9/common/vp9_entropymode.c @@ -179,29 +179,29 @@ static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] - [PARTITION_TYPES - 1] = { - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - }; +const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = + { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split + }; static const vpx_prob default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { @@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { } // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/libs/libvpx/vp9/common/vp9_entropymv.h b/libs/libvpx/vp9/common/vp9_entropymv.h index a8cc7e93a2..e2fe37a327 100644 --- a/libs/libvpx/vp9/common/vp9_entropymv.h +++ b/libs/libvpx/vp9/common/vp9_entropymv.h @@ -27,12 +27,9 @@ void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); -// Integer pel reference mv threshold for use of high-precision 1/8 mv -#define COMPANDED_MVREF_THRESH 8 - static INLINE int use_mv_hp(const MV *ref) { - return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && - (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; + const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv + return abs(ref->row) < kMvRefThresh && abs(ref->col) < kMvRefThresh; } #define MV_UPDATE_PROB 252 diff --git a/libs/libvpx/vp9/common/vp9_frame_buffers.c b/libs/libvpx/vp9/common/vp9_frame_buffers.c index efcf2bf885..a254e79d20 100644 --- a/libs/libvpx/vp9/common/vp9_frame_buffers.c +++ b/libs/libvpx/vp9/common/vp9_frame_buffers.c @@ -52,14 +52,12 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size, if (i == int_fb_list->num_internal_frame_buffers) return -1; if (int_fb_list->int_fb[i].size < min_size) { - int_fb_list->int_fb[i].data = - (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); - if (!int_fb_list->int_fb[i].data) return -1; - - // This memset is needed for fixing valgrind error from C loop filter + vpx_free(int_fb_list->int_fb[i].data); + // The data must be zeroed to fix a valgrind error from the C loop filter // due to access uninitialized memory in frame border. It could be - // removed if border is totally removed. - memset(int_fb_list->int_fb[i].data, 0, min_size); + // skipped if border were totally removed. 
+ int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size); + if (!int_fb_list->int_fb[i].data) return -1; int_fb_list->int_fb[i].size = min_size; } diff --git a/libs/libvpx/vp9/common/vp9_idct.c b/libs/libvpx/vp9/common/vp9_idct.c index dc2e03946d..69069042cc 100644 --- a/libs/libvpx/vp9/common/vp9_idct.c +++ b/libs/libvpx/vp9/common/vp9_idct.c @@ -156,6 +156,8 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_idct16x16_1_add(input, dest, stride); else if (eob <= 10) vpx_idct16x16_10_add(input, dest, stride); + else if (eob <= 38) + vpx_idct16x16_38_add(input, dest, stride); else vpx_idct16x16_256_add(input, dest, stride); } @@ -203,18 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, #if CONFIG_VP9_HIGHBITDEPTH -// 12 signal input bits + 7 forward transform amplify bits + 1 bit -// for contingency in rounding and quantizing -#define VALID_IHT_MAGNITUDE_RANGE (1 << 20) - -static INLINE int detect_invalid_iht_input(const tran_low_t *input, int size) { - int i; - for (i = 0; i < size; ++i) - if (abs(input[i]) >= VALID_IHT_MAGNITUDE_RANGE) return 1; - return 0; -} - -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 @@ -222,20 +213,12 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 }; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; tran_low_t temp_in[4], temp_out[4]; - if (detect_invalid_iht_input(input, 16)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd iht input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return; - } - // Inverse transform row vectors. for (i = 0; i < 4; ++i) { IHT_4[tx_type].rows(input, outptr, bd); @@ -261,21 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = { { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 }; -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - if (detect_invalid_iht_input(input, 64)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd iht input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return; - } // Inverse transform row vectors. 
for (i = 0; i < 8; ++i) { @@ -302,21 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = { { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 }; -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - if (detect_invalid_iht_input(input, 256)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd iht input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return; - } // Rows for (i = 0; i < 16; ++i) { @@ -337,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } // idct -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_idct4x4_16_add(input, dest, stride, bd); @@ -345,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_idct4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); @@ -353,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -363,14 +330,14 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, // DC only DCT coefficient if (eob == 1) { vpx_highbd_idct8x8_1_add(input, dest, stride, bd); - } else if (eob <= 10) { - vpx_highbd_idct8x8_10_add(input, dest, stride, bd); + } else if (eob <= 12) { + vpx_highbd_idct8x8_12_add(input, dest, stride, bd); } else { vpx_highbd_idct8x8_64_add(input, dest, stride, bd); } } -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to separate different cases. 
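Reviewer note on the hunks just above and below (vp9_idct16x16_add / vp9_highbd_idct16x16_add): the eob ("end of block") value counts coefficients in scan order, so a small eob guarantees that every later coefficient is zero and a cheaper partial inverse transform is sufficient; this patch adds a 38-coefficient path between the existing 10-coefficient and full 256-coefficient paths. The stand-alone C sketch below only restates the dispatch thresholds used in this diff; the enum and function names are illustrative and are not libvpx identifiers.

/* Illustrative sketch of the eob-threshold dispatch for a 16x16 block.
 * The thresholds mirror vp9_idct16x16_add() in this diff. */
typedef enum {
  IDCT16_DC_ONLY,    /* eob == 1:  vpx_idct16x16_1_add (DC coefficient only) */
  IDCT16_PARTIAL_10, /* eob <= 10: vpx_idct16x16_10_add */
  IDCT16_PARTIAL_38, /* eob <= 38: vpx_idct16x16_38_add (added by this patch) */
  IDCT16_FULL        /* otherwise: vpx_idct16x16_256_add */
} idct16_path;

static idct16_path choose_idct16x16_path(int eob) {
  if (eob == 1) return IDCT16_DC_ONLY;
  if (eob <= 10) return IDCT16_PARTIAL_10;
  if (eob <= 38) return IDCT16_PARTIAL_38;
  return IDCT16_FULL;
}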
@@ -379,18 +346,22 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, vpx_highbd_idct16x16_1_add(input, dest, stride, bd); } else if (eob <= 10) { vpx_highbd_idct16x16_10_add(input, dest, stride, bd); + } else if (eob <= 38) { + vpx_highbd_idct16x16_38_add(input, dest, stride, bd); } else { vpx_highbd_idct16x16_256_add(input, dest, stride, bd); } } -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // Non-zero coeff only in upper-left 8x8 if (eob == 1) { vpx_highbd_idct32x32_1_add(input, dest, stride, bd); } else if (eob <= 34) { vpx_highbd_idct32x32_34_add(input, dest, stride, bd); + } else if (eob <= 135) { + vpx_highbd_idct32x32_135_add(input, dest, stride, bd); } else { vpx_highbd_idct32x32_1024_add(input, dest, stride, bd); } @@ -398,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, // iht void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); else @@ -406,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); } else { @@ -415,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); } else { diff --git a/libs/libvpx/vp9/common/vp9_idct.h b/libs/libvpx/vp9/common/vp9_idct.h index ea958a38c0..3e83b8402d 100644 --- a/libs/libvpx/vp9/common/vp9_idct.h +++ b/libs/libvpx/vp9/common/vp9_idct.h @@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const 
tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.c b/libs/libvpx/vp9/common/vp9_loopfilter.c index d796d7161c..c7c343aed5 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.c +++ b/libs/libvpx/vp9/common/vp9_loopfilter.c @@ -465,12 +465,11 @@ static void filter_selectively_horiz( if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + vpx_lpf_horizontal_16_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); count = 2; } else { - vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); + vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -559,12 +558,12 @@ static void highbd_filter_selectively_horiz( if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + vpx_highbd_lpf_horizontal_16_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, bd); count = 2; } else { - vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); + vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -1613,12 +1612,14 @@ void vp9_loop_filter_data_reset( void vp9_reset_lfm(VP9_COMMON *const cm) { if (cm->lf.filter_level) { - memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * - cm->lf.lfm_stride * sizeof(*cm->lf.lfm)); + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); } } -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; (void)unused; loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.h b/libs/libvpx/vp9/common/vp9_loopfilter.h index da37a6ebde..481a6cdc63 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.h +++ b/libs/libvpx/vp9/common/vp9_loopfilter.h @@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset( LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); -// Operates on the rows described by 'lf_data'. -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/common/vp9_onyxc_int.h b/libs/libvpx/vp9/common/vp9_onyxc_int.h index 32db7b7aa7..1d96d92c24 100644 --- a/libs/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libs/libvpx/vp9/common/vp9_onyxc_int.h @@ -37,13 +37,10 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. +// 1 scratch frame for the new frame, 3 for scaled references on the encoder. // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. 
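
vp9_loop_filter_worker above now takes untyped pointers and casts them back inside the function, which lets it be assigned directly to a worker hook; the later hunks delete the matching (VPxWorkerHook) casts at the assignment sites. A minimal sketch of that convention follows, using stand-in types rather than the real VPxWorker/LFWorkerData structures; the return convention is also simplified.

#include <stdio.h>

typedef int (*WorkerHook)(void *arg1, void *arg2);

typedef struct {
  int start_row;
  int stop_row;
} JobData;  /* stand-in for LFWorkerData */

static int loop_filter_hook(void *arg1, void *arg2) {
  const JobData *const job = (const JobData *)arg1;  /* cast happens here */
  (void)arg2;                                        /* unused, as above */
  printf("filtering rows [%d, %d)\n", job->start_row, job->stop_row);
  return 1;  /* nonzero on success in this sketch */
}

int main(void) {
  JobData job = { 0, 64 };
  WorkerHook hook = loop_filter_hook;  /* no function-pointer cast needed */
  return hook(&job, NULL) ? 0 : 1;
}

Calling a function through a pointer of a different type is undefined behavior, so moving the cast from the function pointer to the void * arguments keeps every hook invocation type-correct.
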
-#define FRAME_BUFFERS (REF_FRAMES + 7) +#define FRAME_BUFFERS (REF_FRAMES + 4) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -72,30 +69,12 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; - - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - VPxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; } RefCntBuffer; typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. -#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -235,10 +214,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - // TODO(hkuang): Remove this as it is the same as frame_parallel_decode - // in pbi. - int frame_parallel_decode; // frame-based threading. - // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; @@ -283,11 +258,6 @@ typedef struct VP9Common { int above_context_alloc_cols; } VP9_COMMON; -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool); -void unlock_buffer_pool(BufferPool *const pool); - static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; - lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; @@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { i = INVALID_IDX; } - unlock_buffer_pool(cm->buffer_pool); return i; } @@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm, xd->partition_probs = frame_is_intra_only(cm) ? &vp9_kf_partition_probs[0] - : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob; + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; } static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libs/libvpx/vp9/common/vp9_postproc.c b/libs/libvpx/vp9/common/vp9_postproc.c index b105e5d45a..dfc315eeac 100644 --- a/libs/libvpx/vp9/common/vp9_postproc.c +++ b/libs/libvpx/vp9/common/vp9_postproc.c @@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, // if mfqe is enabled. Need to take both the quality and the speed // into consideration. 
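
With frame-parallel decode removed, the buffer pool is only touched from one thread, so get_free_fb in the hunk above drops its lock_buffer_pool/unlock_buffer_pool pair and becomes a plain linear scan. A self-contained sketch of that scan, where the struct is a stand-in and INVALID_IDX is assumed to be -1:

#include <stdio.h>

#define REF_FRAMES 8                    /* 1 << REF_FRAMES_LOG2, as above */
#define FRAME_BUFFERS (REF_FRAMES + 4)  /* 1 new frame + 3 scaled refs */
#define INVALID_IDX (-1)                /* assumed value */

typedef struct { int ref_count; } FrameBuf;  /* stand-in for RefCntBuffer */

static int get_free_fb(FrameBuf *bufs) {
  int i;
  for (i = 0; i < FRAME_BUFFERS; ++i)
    if (bufs[i].ref_count == 0) break;
  if (i == FRAME_BUFFERS) return INVALID_IDX;
  bufs[i].ref_count = 1;  /* caller now owns the buffer */
  return i;
}

int main(void) {
  FrameBuf pool[FRAME_BUFFERS] = { { 1 }, { 0 } };  /* rest zero-initialized */
  printf("first free buffer: %d\n", get_free_fb(pool));  /* prints 1 */
  return 0;
}

The pool itself also shrinks in the hunk above: FRAME_BUFFERS goes from REF_FRAMES + 7 to REF_FRAMES + 4, i.e. from 15 to 12 buffers with the eight reference slots, since the extra scratch frames for parallel decoding cores are no longer needed.
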
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { - vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); } if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, @@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, @@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, } else if (flags & VP9D_DEBLOCK) { vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); } ppstate->last_base_qindex = cm->base_qindex; diff --git a/libs/libvpx/vp9/common/vp9_ppflags.h b/libs/libvpx/vp9/common/vp9_ppflags.h index 6dcfa412be..b8b647bf18 100644 --- a/libs/libvpx/vp9/common/vp9_ppflags.h +++ b/libs/libvpx/vp9/common/vp9_ppflags.h @@ -20,14 +20,7 @@ enum { VP9D_DEBLOCK = 1 << 0, VP9D_DEMACROBLOCK = 1 << 1, VP9D_ADDNOISE = 1 << 2, - VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP9D_DEBUG_DRAW_MV = 1 << 7, - VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP9D_MFQE = 1 << 10 + VP9D_MFQE = 1 << 3 }; typedef struct { diff --git a/libs/libvpx/vp9/common/vp9_reconinter.c b/libs/libvpx/vp9/common/vp9_reconinter.c index 8eb7126898..a108a65153 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.c +++ b/libs/libvpx/vp9/common/vp9_reconinter.c @@ -21,7 +21,7 @@ #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd) { @@ -190,7 +190,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { diff --git a/libs/libvpx/vp9/common/vp9_reconinter.h b/libs/libvpx/vp9/common/vp9_reconinter.h index 4fed4f7f6e..bb9291a264 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.h +++ b/libs/libvpx/vp9/common/vp9_reconinter.h @@ -26,19 +26,19 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, 
int src_stride, uint16_t *dst, int dst_stride, const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -68,7 +68,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); diff --git a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl index 37a867323d..22b67ecace 100644 --- a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp9_common_forward_decls() { print <frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); @@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; diff --git a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index dcfc454aa0..6996260e26 100644 --- a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -16,11 +16,10 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -49,48 +48,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d0 = _mm_packus_epi16(d0, d2); - // store result[0] - *(int *)dest = _mm_cvtsi128_si32(d0); - // store result[1] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // 
store result[2] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - // store result[3] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -131,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = 
_mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm index 30852049b4..ca0897ab9f 100644 --- a/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm +++ b/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -12,6 +12,8 @@ ; TODO(jackychen): Find a way to fix the duplicate. 
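
The SSE2 rewrites above replace the RECON_AND_STORE macro and the open-coded 4x4 reconstruction with calls to a shared recon_and_store() helper. The core operation is the same either way: widen a row of destination pixels to 16 bits, add the residual, and pack back with unsigned saturation. A small, self-contained illustration of that step; this local recon_and_store8 is not the vpx_dsp helper itself, only the same idea.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static void recon_and_store8(uint8_t *dest, const int16_t *residual) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i res = _mm_loadu_si128((const __m128i *)residual);
  __m128i d = _mm_loadl_epi64((const __m128i *)dest);  /* 8 pixels */
  d = _mm_unpacklo_epi8(d, zero);  /* widen u8 -> s16 */
  d = _mm_add_epi16(d, res);       /* add residual */
  d = _mm_packus_epi16(d, d);      /* clamp to [0, 255] */
  _mm_storel_epi64((__m128i *)dest, d);
}

int main(void) {
  uint8_t dest[8] = { 250, 10, 0, 128, 3, 200, 60, 255 };
  const int16_t res[8] = { 20, -20, -5, 1, 0, 100, 0, 0 };
  int i;
  recon_and_store8(dest, res);
  for (i = 0; i < 8; ++i) printf("%d ", dest[i]);  /* 255 0 0 129 3 255 60 255 */
  printf("\n");
  return 0;
}
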
%include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp9_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.c b/libs/libvpx/vp9/decoder/vp9_decodeframe.c index fde0b7e318..d0e896c13f 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.c @@ -189,21 +189,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -256,21 +257,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -451,24 +453,19 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, const struct scale_factors *sf, MACROBLOCKD *xd, int w, int h, int ref, int xs, int ys) { DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); - const uint8_t *buf_ptr; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset; + highbd_inter_predictor(mc_buf_high + border_offset, b_w, + CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = ((uint8_t *)mc_buf_high) + border_offset; - } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); - } else { - 
inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf, - w, h, ref, kernel, xs, ys); + inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst, + dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel, + xs, ys); } } #else @@ -493,8 +490,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, #endif // CONFIG_VP9_HIGHBITDEPTH static void dec_build_inter_predictors( - VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x, - int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel, + MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h, + int mi_x, int mi_y, const InterpKernel *kernel, const struct scale_factors *sf, struct buf_2d *pre_buf, struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf, int is_scaled, int ref) { @@ -596,12 +593,6 @@ static void dec_build_inter_predictors( y_pad = 1; } - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { @@ -620,18 +611,11 @@ static void dec_build_inter_predictors( w, h, ref, xs, ys); return; } - } else { - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) { - const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - } } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, @@ -655,8 +639,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int is_compound = has_second_ref(mi); int ref; int is_scaled; - VPxWorker *const fwo = - pbi->frame_parallel_decode ? 
pbi->frame_worker_owner : NULL; for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; @@ -688,10 +670,10 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, for (y = 0; y < num_4x4_h; ++y) { for (x = 0; x < num_4x4_w; ++x) { const MV mv = average_split_mvs(pd, mi, ref, i++); - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x, - 4 * y, 4, 4, mi_x, mi_y, kernel, sf, - pre_buf, dst_buf, &mv, ref_frame_buf, - is_scaled, ref); + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y, + 4, 4, mi_x, mi_y, kernel, sf, pre_buf, + dst_buf, &mv, ref_frame_buf, is_scaled, + ref); } } } @@ -705,7 +687,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int n4w_x4 = 4 * num_4x4_w; const int n4h_x4 = 4 * num_4x4_h; struct buf_2d *const pre_buf = &pd->pre[ref]; - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel, sf, pre_buf, dst_buf, &mv, ref_frame_buf, is_scaled, ref); } @@ -838,8 +820,9 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); const int max_blocks_high = num_4x4_h + - (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> - (5 + pd->subsampling_y)); + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; @@ -1188,7 +1171,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1198,12 +1180,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1274,7 +1255,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1284,12 +1264,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1385,7 +1364,7 @@ static const uint8_t 
*decode_tiles(VP9Decoder *pbi, const uint8_t *data, pbi->lf_worker.data1 == NULL) { CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker; + pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Loop filter thread creation failed"); @@ -1474,11 +1453,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, winterface->execute(&pbi->lf_worker); } } - // After loopfiltering, the last 7 row pixels in each superblock row may - // still be changed by the longest loopfilter of the next superblock - // row. - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2); } } @@ -1494,16 +1468,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, // Get last tile data. tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); return vpx_reader_find_end(&tile_data->bit_reader); } // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. -static int tile_worker_hook(TileWorkerData *const tile_data, - VP9Decoder *const pbi) { +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData *)arg1; + VP9Decoder *const pbi = (VP9Decoder *)arg2; + TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; @@ -1517,7 +1491,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data, return 0; } - tile_data->xd.error_info = &tile_data->error_info; tile_data->xd.corrupted = 0; do { @@ -1529,6 +1502,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data, &tile_data->error_info, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + // init resets xd.error_info + tile_data->xd.error_info = &tile_data->error_info; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { @@ -1596,7 +1571,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; - worker->hook = (VPxWorkerHook)tile_worker_hook; + worker->hook = tile_worker_hook; worker->data1 = tile_data; worker->data2 = pbi; } @@ -1779,24 +1754,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->show_existing_frame) { // Show an existing frame directly. 
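
tile_worker_hook above now matches the generic void * hook signature, and it re-installs xd.error_info after vp9_init_macroblockd because, as the added comment notes, the init call resets that pointer; without the re-init, an error raised inside the tile would be reported through whatever error_info the init installed rather than this worker's own. A simplified sketch of the setjmp-based error flow, with stand-in types rather than the real TileWorkerData/vpx_internal_error machinery:

#include <setjmp.h>
#include <stdio.h>

typedef struct { jmp_buf jmp; int code; } ErrorInfo;
typedef struct { ErrorInfo error_info; ErrorInfo *active; int corrupted; } TileData;

static void internal_error(ErrorInfo *e, int code) {
  e->code = code;
  longjmp(e->jmp, 1);  /* unwind back to the worker's setjmp */
}

static void decode_tile(TileData *t, int fail) {
  if (fail) internal_error(t->active, 7);  /* e.g. a corrupt bitstream */
}

static int tile_worker_hook(TileData *t, int fail) {
  if (setjmp(t->error_info.jmp)) {
    t->corrupted = 1;
    return 0;
  }
  t->active = &t->error_info;  /* must point at this worker's own error_info */
  t->corrupted = 0;
  decode_tile(t, fail);
  return !t->corrupted;
}

int main(void) {
  TileData t;
  printf("ok tile: %d\n", tile_worker_hook(&t, 0));   /* 1 */
  printf("bad tile: %d\n", tile_worker_hook(&t, 1));  /* 0 */
  return 0;
}
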
const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; - lock_buffer_pool(pool); if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); } ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - unlock_buffer_pool(pool); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; cm->show_frame = 1; - if (pbi->frame_parallel_decode) { - for (i = 0; i < REF_FRAMES; ++i) - cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; - } return 0; } @@ -1913,7 +1881,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); // Generate next_ref_frame_map. - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; @@ -1933,7 +1900,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->ref_frame_map[ref_index] >= 0) ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; if (frame_is_intra_only(cm) || cm->error_resilient_mode) @@ -2090,24 +2056,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - // If encoded in frame parallel mode, frame context is ready after decoding - // the frame header. - if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - if (cm->refresh_frame_context) { - context_updated = 1; - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; - } - vp9_frameworker_lock_stats(worker); - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - frame_worker_data->frame_context_ready = 1; - // Signal the main thread that context is ready. - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } - if (pbi->tile_worker_data == NULL || (tile_cols * tile_rows) != pbi->total_tiles) { const int num_tile_workers = diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.c b/libs/libvpx/vp9/decoder/vp9_decodemv.c index 4372ba0371..0a781413b1 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libs/libvpx/vp9/decoder/vp9_decodemv.c @@ -455,12 +455,6 @@ static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv, } } -static void fpm_sync(void *const data, int mi_row) { - VP9Decoder *const pbi = (VP9Decoder *)data; - vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, - mi_row << MI_BLOCK_SIZE_LOG2); -} - // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector or early_break // it will also skip all additional processing and jump to Done! @@ -500,8 +494,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const POSITION *const mv_ref_search, int_mv *mv_ref_list, int mi_row, int mi_col, - int block, int is_sub8x8, find_mv_refs_sync sync, - void *const data) { + int block, int is_sub8x8) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; int different_ref_found = 0; @@ -557,23 +550,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast -// on windows platform. 
The sync here is unnecessary if use_prev_frame_mvs -// is 0. But after removing it, there will be hang in the unit test on windows -// due to several threads waiting for a thread's signal. -#if defined(_WIN32) && !HAVE_PTHREAD_H - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } -#endif - // Check the last frame's mode and mv info. if (prev_frame_mvs) { - // Synchronize here for frame parallel decode if sync function is provided. - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } - if (prev_frame_mvs->ref_frame[0] == ref_frame) { ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { @@ -652,7 +630,7 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, refmv_count = dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, - mv_list, mi_row, mi_col, block, 1, NULL, NULL); + mv_list, mi_row, mi_col, block, 1); switch (block) { case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break; @@ -750,9 +728,8 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; int refmv_count; - refmv_count = - dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, - mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi); + refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, + tmp_mvs, mi_row, mi_col, -1, 0); dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref], refmv_count); @@ -770,6 +747,10 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, int idx, idy; PREDICTION_MODE b_mode; int_mv best_sub8x8[2]; + const uint32_t invalid_mv = 0x80008000; + // Initialize the 2nd element as even though it won't be used meaningfully + // if is_compound is false, copying/clamping it may trigger a MSan warning. + best_sub8x8[1].as_int = invalid_mv; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int j = idy * 2 + idx; diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.c b/libs/libvpx/vp9/decoder/vp9_decoder.c index 37693f0944..a913fa560c 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.c +++ b/libs/libvpx/vp9/decoder/vp9_decoder.c @@ -139,6 +139,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -169,7 +170,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); else - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); } @@ -217,7 +218,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. - vp8_yv12_copy_frame(sd, ref_buf); + vpx_yv12_copy_frame(sd, ref_buf); } return cm->error.error_code; @@ -230,7 +231,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) { BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { const int old_idx = cm->ref_frame_map[ref_index]; // Current thread releases the holding of reference frame. 
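
The read_inter_block_mode_info hunk above pre-fills best_sub8x8[1] with 0x80008000 so that MSan does not flag the copy/clamp of an uninitialized value when the block is not compound-predicted. The constant is simply INT16_MIN in both 16-bit halves of the packed motion vector, a value no legal component can take. A tiny illustration; the union below mirrors libvpx's int_mv layout in spirit, and the exact field order is an assumption here (both halves are identical, so it does not affect the output).

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t row, col; } MV;              /* illustrative */
typedef union { uint32_t as_int; MV as_mv; } int_mv;  /* illustrative */

int main(void) {
  int_mv mv;
  mv.as_int = 0x80008000;
  printf("row=%d col=%d\n", mv.as_mv.row, mv.as_mv.col);  /* both -32768 */
  return 0;
}
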
@@ -250,15 +250,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) { decrease_ref_count(old_idx, frame_bufs, pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 0; cm->frame_to_show = get_frame_new_buffer(cm); - if (!pbi->frame_parallel_decode || !cm->show_frame) { - lock_buffer_pool(pool); - --frame_bufs[cm->new_fb_idx].ref_count; - unlock_buffer_pool(pool); - } + --frame_bufs[cm->new_fb_idx].ref_count; // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) @@ -292,11 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->ready_for_new_data = 0; // Check if the previous frame was a frame without any references to it. - // Release frame buffer if not decoding in frame parallel mode. - if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 && - frame_bufs[cm->new_fb_idx].ref_count == 0) + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { @@ -309,18 +306,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; pbi->hold_ref_buf = 0; - if (pbi->frame_parallel_decode) { - VPxWorker *const worker = pbi->frame_worker_owner; - vp9_frameworker_lock_stats(worker); - frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; - // Reset decoding progress. - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - vp9_frameworker_unlock_stats(worker); - } else { - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - } + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); @@ -336,7 +322,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, winterface->sync(&pbi->tile_workers[i]); } - lock_buffer_pool(pool); // Release all the reference buffers if worker thread is holding them. if (pbi->hold_ref_buf == 1) { int ref_index = 0, mask; @@ -361,7 +346,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, } // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - unlock_buffer_pool(pool); vpx_clear_system_state(); return -1; @@ -377,31 +361,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (!cm->show_existing_frame) { cm->last_show_frame = cm->show_frame; cm->prev_frame = cm->cur_frame; - if (cm->seg.enabled && !pbi->frame_parallel_decode) - vp9_swap_current_and_last_seg_map(cm); + if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } // Update progress in frame parallel decode. - if (pbi->frame_parallel_decode) { - // Need to lock the mutex here as another thread may - // be accessing this buffer. 
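
The swap_frame_buffers and vp9_receive_compressed_data changes around here pair the ref_count bookkeeping with the new released flag on RefCntBuffer, so release_fb_cb is invoked at most once per buffer even when several code paths (error handling, buffer swap, the pre-decode cleanup) see the count reach zero. A minimal sketch of that guard, with an illustrative struct and callback rather than the real pool types:

#include <stdio.h>

typedef struct {
  int ref_count;
  int released;  /* set once the external frame-buffer callback has run */
} Buf;

static void release_cb(Buf *b) { (void)b; printf("release_fb_cb called\n"); }

static void decrease_ref_count(Buf *b) {
  if (b->ref_count > 0) --b->ref_count;
  /* Hand the buffer back to the application exactly once, even if the
   * count drops to zero on more than one path. */
  if (!b->released && b->ref_count == 0) {
    release_cb(b);
    b->released = 1;
  }
}

int main(void) {
  Buf b = { 2, 0 };
  decrease_ref_count(&b);  /* 2 -> 1, nothing released */
  decrease_ref_count(&b);  /* 1 -> 0, release_cb fires once */
  decrease_ref_count(&b);  /* already released, no double release */
  return 0;
}
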
- VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - vp9_frameworker_lock_stats(worker); - - if (cm->show_frame) { - cm->current_video_frame++; - } - frame_worker_data->frame_decoded = 1; - frame_worker_data->frame_context_ready = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } else { - cm->last_width = cm->width; - cm->last_height = cm->height; - if (cm->show_frame) { - cm->current_video_frame++; - } + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; } cm->error.setjmp = 0; diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.h b/libs/libvpx/vp9/decoder/vp9_decoder.h index 427baf1e0b..4b26c314d3 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.h +++ b/libs/libvpx/vp9/decoder/vp9_decoder.h @@ -21,7 +21,6 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -#include "vp9/decoder/vp9_dthread.h" #ifdef __cplusplus extern "C" { @@ -53,13 +52,10 @@ typedef struct VP9Decoder { int refresh_frame_flags; - int frame_parallel_decode; // frame-based threading. - // TODO(hkuang): Combine this with cur_buf in macroblockd as they are // the same. RefCntBuffer *cur_buf; // Current decoding frame buffer. - VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. VPxWorker lf_worker; VPxWorker *tile_workers; TileWorkerData *tile_worker_data; @@ -121,9 +117,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, // But the private buffer is not set up until finish decoding header. // So any error happens during decoding header, the frame_bufs will not // have valid priv buffer. - if (frame_bufs[idx].ref_count == 0 && + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && frame_bufs[idx].raw_frame_buffer.priv) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; } } } diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.c b/libs/libvpx/vp9/decoder/vp9_detokenize.c index 7048fb1ca0..4bd016dc7d 100644 --- a/libs/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libs/libvpx/vp9/decoder/vp9_detokenize.c @@ -99,9 +99,10 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, vp9_cat6_prob; const int cat6_bits = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 : + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 : #endif // CONFIG_VP9_HIGHBITDEPTH - 14; + 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. BD_VALUE value = r->value; @@ -170,7 +171,12 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, read_coeff(r, vp9_cat1_prob, 1, &value, &count, &range); } } +#if CONFIG_VP9_HIGHBITDEPTH + // val may use 18-bits + v = (int)(((int64_t)val * dqv) >> dq_shift); +#else v = (val * dqv) >> dq_shift; +#endif } else { if (read_bool(r, p[1], &value, &count, &range)) { token_cache[scan[c]] = 3; @@ -188,9 +194,8 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, } #if CONFIG_COEFFICIENT_RANGE_CHECKING #if CONFIG_VP9_HIGHBITDEPTH - dqcoeff[scan[c]] = - highbd_check_range(read_bool(r, 128, &value, &count, &range) ? -v : v), - xd->bd); + dqcoeff[scan[c]] = highbd_check_range( + read_bool(r, 128, &value, &count, &range) ? -v : v, xd->bd); #else dqcoeff[scan[c]] = check_range(read_bool(r, 128, &value, &count, &range) ? 
-v : v); diff --git a/libs/libvpx/vp9/decoder/vp9_dthread.c b/libs/libvpx/vp9/decoder/vp9_dthread.c deleted file mode 100644 index 52bc2a0f60..0000000000 --- a/libs/libvpx/vp9/decoder/vp9_dthread.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - -// #define DEBUG_THREAD - -// TODO(hkuang): Clean up all the #ifdef in this file. -void vp9_frameworker_lock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_lock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_unlock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_unlock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_signal_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - -// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper. -#if defined(_WIN32) && !HAVE_PTHREAD_H - pthread_cond_signal(&worker_data->stats_cond); -#else - pthread_cond_broadcast(&worker_data->stats_cond); -#endif - -#else - (void)worker; -#endif -} - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif - -// TODO(hkuang): Remove worker parameter as it is only used in debug code. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row) { -#if CONFIG_MULTITHREAD - if (!ref_buf) return; - -#ifndef BUILDING_WITH_TSAN - // The following line of code will get harmless tsan error but it is the key - // to get best performance. - if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; -#endif - - { - // Find the worker thread that owns the reference frame. If the reference - // frame has been fully decoded, it may not have owner. 
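
The vp9_dthread.c file being deleted here implemented frame-parallel synchronization as a condvar-guarded progress counter: a decoder blocked in vp9_frameworker_wait until the owning worker's row counter, advanced by vp9_frameworker_broadcast, reached the rows it needed. A stripped-down sketch of that pattern, with made-up types and none of the ownership or corruption checks of the original:

#include <pthread.h>
#include <stdio.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int row;  /* last fully decoded row, -1 before decoding starts */
} Progress;

static void progress_wait(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  while (p->row < row) pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}

static void progress_broadcast(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->row = row;
  pthread_cond_broadcast(&p->cond);
  pthread_mutex_unlock(&p->mutex);
}

static void *producer(void *arg) {
  Progress *p = (Progress *)arg;
  int row;
  for (row = 0; row <= 64; row += 8) progress_broadcast(p, row);
  return NULL;
}

int main(void) {
  Progress p = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, -1 };
  pthread_t thread;
  pthread_create(&thread, NULL, producer, &p);
  progress_wait(&p, 32);  /* returns once at least rows [0, 32] are done */
  printf("reference rows ready\n");
  pthread_join(thread, NULL);
  return 0;
}
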
- VPxWorker *const ref_worker = ref_buf->frame_worker_owner; - FrameWorkerData *const ref_worker_data = - (FrameWorkerData *)ref_worker->data1; - const VP9Decoder *const pbi = ref_worker_data->pbi; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", - worker_data->worker_id, worker, ref_worker_data->worker_id, - ref_buf->frame_worker_owner, row, ref_buf->row); - } -#endif - - vp9_frameworker_lock_stats(ref_worker); - while (ref_buf->row < row && pbi->cur_buf == ref_buf && - ref_buf->buf.corrupted != 1) { - pthread_cond_wait(&ref_worker_data->stats_cond, - &ref_worker_data->stats_mutex); - } - - if (ref_buf->buf.corrupted == 1) { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_frameworker_unlock_stats(ref_worker); - vpx_internal_error(&worker_data->pbi->common.error, - VPX_CODEC_CORRUPT_FRAME, - "Worker %p failed to decode frame", worker); - } - vp9_frameworker_unlock_stats(ref_worker); - } -#else - (void)worker; - (void)ref_buf; - (void)row; - (void)ref_buf; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { -#if CONFIG_MULTITHREAD - VPxWorker *worker = buf->frame_worker_owner; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, - buf->frame_worker_owner, row); - } -#endif - - vp9_frameworker_lock_stats(worker); - buf->row = row; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); -#else - (void)buf; - (void)row; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; - FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; - VP9_COMMON *const src_cm = &src_worker_data->pbi->common; - VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; - int i; - - // Wait until source frame's context is ready. - vp9_frameworker_lock_stats(src_worker); - while (!src_worker_data->frame_context_ready) { - pthread_cond_wait(&src_worker_data->stats_cond, - &src_worker_data->stats_mutex); - } - - dst_cm->last_frame_seg_map = src_cm->seg.enabled - ? src_cm->current_frame_seg_map - : src_cm->last_frame_seg_map; - dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; - vp9_frameworker_unlock_stats(src_worker); - - dst_cm->bit_depth = src_cm->bit_depth; -#if CONFIG_VP9_HIGHBITDEPTH - dst_cm->use_highbitdepth = src_cm->use_highbitdepth; -#endif - dst_cm->prev_frame = - src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; - dst_cm->last_width = - !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width; - dst_cm->last_height = - !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height; - dst_cm->subsampling_x = src_cm->subsampling_x; - dst_cm->subsampling_y = src_cm->subsampling_y; - dst_cm->frame_type = src_cm->frame_type; - dst_cm->last_show_frame = !src_cm->show_existing_frame - ? 
src_cm->show_frame - : src_cm->last_show_frame; - for (i = 0; i < REF_FRAMES; ++i) - dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; - - memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, - (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); - dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; - dst_cm->lf.filter_level = src_cm->lf.filter_level; - memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); - memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); - dst_cm->seg = src_cm->seg; - memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, - FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); -#else - (void)dst_worker; - (void)src_worker; -#endif // CONFIG_MULTITHREAD -} diff --git a/libs/libvpx/vp9/decoder/vp9_dthread.h b/libs/libvpx/vp9/decoder/vp9_dthread.h deleted file mode 100644 index fce0fe7fe3..0000000000 --- a/libs/libvpx/vp9/decoder/vp9_dthread.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - -#include "./vpx_config.h" -#include "vpx_util/vpx_thread.h" -#include "vpx/internal/vpx_codec_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct VP9Common; -struct VP9Decoder; - -// WorkerData for the FrameWorker thread. It contains all the information of -// the worker and decode structures for decoding a frame. -typedef struct FrameWorkerData { - struct VP9Decoder *pbi; - const uint8_t *data; - const uint8_t *data_end; - size_t data_size; - void *user_priv; - int result; - int worker_id; - int received_frame; - - // scratch_buffer is used in frame parallel mode only. - // It is used to make a copy of the compressed data. - uint8_t *scratch_buffer; - size_t scratch_buffer_size; - -#if CONFIG_MULTITHREAD - pthread_mutex_t stats_mutex; - pthread_cond_t stats_cond; -#endif - - int frame_context_ready; // Current frame's context is ready to read. - int frame_decoded; // Finished decoding current frame. -} FrameWorkerData; - -void vp9_frameworker_lock_stats(VPxWorker *const worker); -void vp9_frameworker_unlock_stats(VPxWorker *const worker); -void vp9_frameworker_signal_stats(VPxWorker *const worker); - -// Wait until ref_buf has been decoded to row in real pixel unit. -// Note: worker may already finish decoding ref_buf and release it in order to -// start decoding next frame. So need to check whether worker is still decoding -// ref_buf. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row); - -// FrameWorker broadcasts its decoding progress so other workers that are -// waiting on it can resume decoding. -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); - -// Copy necessary decoding context from src worker to dst worker. 
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c index afffc77178..513718e7cb 100644 --- a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -16,20 +16,20 @@ #include "vp9/common/vp9_blockd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - int16_t temp_buffer[64]; + tran_low_t temp_buffer[64]; (void)coeff_ptr; vpx_fdct8x8_neon(input, temp_buffer, stride); - vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan_ptr, iscan_ptr); + vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr, + iscan_ptr); } diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c new file mode 100644 index 0000000000..4152e7bb5d --- /dev/null +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +} + +// Denoise a 16x1 vector. 
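+// A scalar sketch of what the per-pixel kernel below computes (illustrative
+// only; these names are not part of the library):
+//
+//   d = abs(mc_avg[i] - sig[i]);
+//   if (d < level1_threshold)              // level1_threshold = 4 + shift_inc
+//     adj = d;
+//   else
+//     adj = level1_adjustment              // 4 + shift_inc (or 3)
+//           + (d >= level2_threshold)      // level2_threshold = 8, adds 1
+//           + 2 * (d >= level3_threshold); // level3_threshold = 16, adds 2
+//   avg[i]    = (mc_avg[i] > sig[i]) ? sat_add(sig[i], adj) : sat_sub(sig[i], adj);
+//   sum_diff += (mc_avg[i] > sig[i]) ? adj : -adj;
+//
+// The NEON version evaluates all 16 pixels of a row at once with unsigned
+// compares and masked additions.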
+static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. 
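+  // This second pass nudges the first-pass result back toward the source by
+  // at most k_delta per pixel, with the opposite sign to the original
+  // adjustment, so the accumulated sum_diff moves back toward zero and can be
+  // rechecked against the threshold by the caller.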
+ const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. 
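+      // For example (hypothetical numbers): with a BLOCK_8X16 block,
+      // num_pels_log2_lookup[bs] is 7 (128 pixels), so an excess of
+      // abs(sum_diff) - sum_diff_thresh = 200 over the threshold yields
+      // delta = (200 >> 7) + 1 = 2 below, i.e. each pixel of running_avg_y may
+      // be pulled back toward sig by at most 2 in the adjustment pass.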
+ + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. +static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[4][4]; + int r, c, sum_diff = 0; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. 
+ if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. + if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c new file mode 100644 index 0000000000..e46f789bac --- /dev/null +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +// Note: The scaling functions could write extra rows and columns in dst, which +// exceed the right and bottom boundaries of the destination frame. We rely on +// the following frame extension function to fix these rows and columns. 
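+// Phase-0 down-scaling is plain decimation. A scalar sketch of what the
+// vld2q_u8()/vld4q_u8() de-interleaving loads below accomplish (illustrative
+// only):
+//
+//   for (y = 0; y < h; ++y)
+//     for (x = 0; x < w; ++x)
+//       dst[y * dst_stride + x] = src[(2 * y) * src_stride + 2 * x];  // 4 * for 4:1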
+ +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, 
uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). 
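+  // In scalar terms (offsets simplified, sketch only): each temp-buffer pixel
+  // is an 8-tap SUBPEL filter applied horizontally at stride 2, and each dst
+  // pixel is the same filter applied vertically at stride 2 over the temp
+  // buffer:
+  //   t[r][c]   ~ ROUND_POWER_OF_TWO(sum_k coef[k] * src[r][2 * c + k - 3], FILTER_BITS)
+  //   dst[r][c] ~ ROUND_POWER_OF_TWO(sum_k coef[k] * t[2 * r + k - 3][c], FILTER_BITS)
+  // clipped to 8 bits. The transposes let the filter consume eight
+  // neighbouring source columns straight from vector registers.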
+ do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row using + // vld4_u8(). 
+ do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10], + &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. + +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. 
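+  // 4:3 phase stepping: step_q4 = 16 * 4 / 3 = 21, so three output pixels are
+  // produced from every four input pixels. With phase_scaler = 8, for example,
+  // the three sub-pixel phases are 8, (8 + 21) & 15 = 13 and (8 + 42) & 15 = 2,
+  // taken at source offsets (8 + 21) >> 4 = 1 and (8 + 42) >> 4 = 3 relative to
+  // the first tap (and offsets 4, 5, 7 for the second output triple).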
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 
4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = + vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = + vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = + vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], + &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], 
&s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst_w / 2; + const int dst_uv_h = dst_h / 2; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, 
dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scaled = 1; + if (filter_type == BILINEAR) { + scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, phase_scaler, + temp_buffer); + scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + } else { + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + } + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. 
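+    // Ratios handled by the NEON paths above: 2:1 and 4:1 (phase 0, bilinear,
+    // or the general 8-tap kernel) and 4:3. Any other ratio, or any path whose
+    // temporary buffer allocation failed (scaled stays 0), falls through to
+    // the generic C scaler.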
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 33c2fc7feb..97a09bdff6 100644 --- a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -9,9 +9,10 @@ */ #include - +#include #include +#include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" @@ -21,97 +22,216 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" + +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + (void)scan; + (void)skip_block; + assert(!skip_block); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
- int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); - // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[0], v_qcoeff); - vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } - // now process the rest of the ac coeffs - for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[i], v_qcoeff); - vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); - } - { - const int16x4_t v_eobmax_3210 = vmax_s16( - vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = 
vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); + } + { + const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); - } - } else { - memset(qcoeff_ptr, 0, count * sizeof(int16_t)); - memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); - *eob_ptr = 0; + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = 
vdupq_n_s16(-1); + + // ROUND_POWER_OF_TWO(round_ptr[], 1) + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x4_t dequant = vld1_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + + // Process dc and the first seven ac coeffs. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + uint16x8_t eob_max; + (void)scan; + (void)count; + (void)skip_block; + assert(!skip_block); + + // coeff * quant_ptr[]) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant); + + // Restore sign. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + // qcoeff * dequant[] / 2 + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + + { + int i; + const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); + const int16x8_t dequant_thresh = + vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); + + // Process the rest of the ac coeffs. 
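+    // Scalar sketch of one coefficient in this 32x32 fp path (illustrative
+    // only):
+    //   abs_q = ((abs(coeff) + ROUND_POWER_OF_TWO(round[rc != 0], 1)) *
+    //            quant[rc != 0]) >> 15;
+    //   q     = (abs(coeff) >= (dequant[rc != 0] >> 2)) ? sign(coeff) * abs_q : 0;
+    //   dq    = q * dequant[rc != 0] / 2;  // C division truncates toward zero
+    // vqdmulhq_s16() supplies the ">> 15" product and extract_sign_bit()
+    // reproduces the truncating division before the final narrowing shift.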
+ for (i = 8; i < 32 * 32; i += 8) { + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + + qcoeff = vqdmulhq_s16(qcoeff, quant); + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } } } diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c deleted file mode 100644 index 23f7ebace4..0000000000 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, - uint8_t *frm2_ptr, int32_t filt_sth, - int32_t filt_wgt, uint32_t *acc, - uint16_t *cnt) { - uint32_t row; - uint64_t f0, f1, f2, f3; - v16i8 frm2, frm1 = { 0 }; - v16i8 frm4, frm3 = { 0 }; - v16u8 frm_r, frm_l; - v8i16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 2; row--;) { - LD4(frm1_ptr, stride, f0, f1, f2, f3); - frm1_ptr += (4 * stride); - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 32; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - INSERT_D2_SB(f0, f1, frm1); - INSERT_D2_SB(f2, f3, frm3); - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - UNPCK_UB_SH(frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, 
mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - UNPCK_UB_SH(frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - } -} - -static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, - uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - v16i8 frm1, frm2, frm3, frm4; - v16u8 frm_r, frm_l; - v16i8 zero = { 0 }; - v8u16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 8; row--;) { - LD_SB2(frm1_ptr, stride, frm1, frm3); - frm1_ptr += stride; - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 16; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, 
strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - frm1_ptr += stride; - frm2_ptr += 16; - } -} - -void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, - uint8_t *frame2_ptr, uint32_t blk_w, - uint32_t blk_h, int32_t strength, - int32_t filt_wgt, uint32_t *accu, - uint16_t *cnt) { - if (8 == (blk_w * blk_h)) { - temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else if (16 == (blk_w * blk_h)) { - temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else { - vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, - strength, filt_wgt, accu, cnt); - } -} diff --git a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c index 3aeefb5845..acc3764c7a 100644 --- a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c +++ b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c @@ -15,7 +15,7 @@ struct ALT_REF_AQ { int dummy; }; -struct ALT_REF_AQ *vp9_alt_ref_aq_create() { +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); } diff --git a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h index 18acd8a85b..e508cb44ac 100644 --- a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -54,7 +54,7 @@ struct ALT_REF_AQ; * * \return Instance of the class */ -struct ALT_REF_AQ *vp9_alt_ref_aq_create(); +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); /*!\brief Upload segmentation_map to self object * diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 072d92e4e9..2f2f0055a7 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -127,22 +127,16 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; - int num8x8bl = cm->MBs << 2; - // Weight for segment prior to encoding: take the average of the target - // number for the frame to be encoded and the actual from the previous frame. - int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; - double weight_segment = - (double)((target_refresh + cr->actual_num_seg1_blocks + - cr->actual_num_seg2_blocks) >> - 1) / - num8x8bl; - // Compute delta-q corresponding to qindex i. 
- int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + int deltaq = 0; + if (cpi->oxcf.speed < 8) + deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + else + deltaq = -(cr->max_qdelta_perc * i) / 200; // Take segment weighted average for bits per mb. - bits_per_mb = (int)((1.0 - weight_segment) * + bits_per_mb = (int)((1.0 - cr->weight_segment) * vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) + - weight_segment * + cr->weight_segment * vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, correction_factor, cm->bit_depth)); return bits_per_mb; @@ -246,24 +240,66 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi, } } -// Update the actual number of blocks that were applied the segment delta q. +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi = cm->mi_grid_visible; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + RATE_CONTROL *const rc = &cpi->rc; unsigned char *const seg_map = cpi->segmentation_map; + double fraction_low = 0.0; + int force_gf_refresh = 0; + int low_content_frame = 0; int mi_row, mi_col; cr->actual_num_seg1_blocks = 0; cr->actual_num_seg2_blocks = 0; - for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { - if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) == - CR_SEGMENT_ID_BOOST1) + MV mv = mi[0]->mv[0].as_mv; + int map_index = mi_row * cm->mi_cols + mi_col; + if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST1) cr->actual_num_seg1_blocks++; - else if (cyclic_refresh_segment_id( - seg_map[mi_row * cm->mi_cols + mi_col]) == + else if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST2) cr->actual_num_seg2_blocks++; + // Accumulate low_content_frame. + if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16) + low_content_frame++; + mi++; } + mi += 8; + } + // Check for golden frame update: only for non-SVC and non-golden boost. + if (!cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && + !cpi->oxcf.gf_cbr_boost_pct) { + // Force this frame as a golden update frame if this frame changes the + // resolution (resize_pending != 0). + if (cpi->resize_pending != 0) { + vp9_cyclic_refresh_set_golden_update(cpi); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + force_gf_refresh = 1; + } + // Update average of low content/motion in the frame. + fraction_low = (double)low_content_frame / (cm->mi_rows * cm->mi_cols); + cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; + if (!force_gf_refresh && cpi->refresh_golden_frame == 1 && + rc->frames_since_key > rc->frames_since_golden + 1) { + // Don't update golden reference if the amount of low_content for the + // current encoded frame is small, or if the recursive average of the + // low_content over the update interval window falls below threshold. + if (fraction_low < 0.65 || cr->low_content_avg < 0.6) { + cpi->refresh_golden_frame = 0; + } + // Reset for next internal. 
+ cr->low_content_avg = fraction_low; + } + } } // Set golden frame update interval, for non-svc 1 pass CBR mode. @@ -278,72 +314,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { else rc->baseline_gf_interval = 40; if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20; -} - -// Update some encoding stats (from the just encoded frame). If this frame's -// background has high motion, refresh the golden frame. Otherwise, if the -// golden reference is to be updated check if we should NOT update the golden -// ref. -void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) { - VP9_COMMON *const cm = &cpi->common; - CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - int mi_row, mi_col; - double fraction_low = 0.0; - int low_content_frame = 0; - MODE_INFO **mi = cm->mi_grid_visible; - RATE_CONTROL *const rc = &cpi->rc; - const int rows = cm->mi_rows, cols = cm->mi_cols; - int cnt1 = 0, cnt2 = 0; - int force_gf_refresh = 0; - int flag_force_gf_high_motion = 0; - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - if (flag_force_gf_high_motion == 1) { - int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 - ? mi[0]->mv[0].as_mv.row - : -1 * mi[0]->mv[0].as_mv.row; - int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 - ? mi[0]->mv[0].as_mv.col - : -1 * mi[0]->mv[0].as_mv.col; - // Calculate the motion of the background. - if (abs_mvr <= 16 && abs_mvc <= 16) { - cnt1++; - if (abs_mvr == 0 && abs_mvc == 0) cnt2++; - } - } - mi++; - // Accumulate low_content_frame. - if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++; - } - mi += 8; - } - // For video conference clips, if the background has high motion in current - // frame because of the camera movement, set this frame as the golden frame. - // Use 70% and 5% as the thresholds for golden frame refreshing. - // Also, force this frame as a golden update frame if this frame will change - // the resolution (resize_pending != 0). - if (cpi->resize_pending != 0 || - (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) { - vp9_cyclic_refresh_set_golden_update(cpi); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - - if (rc->frames_till_gf_update_due > rc->frames_to_key) - rc->frames_till_gf_update_due = rc->frames_to_key; - cpi->refresh_golden_frame = 1; - force_gf_refresh = 1; - } - fraction_low = (double)low_content_frame / (rows * cols); - // Update average. - cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; - if (!force_gf_refresh && cpi->refresh_golden_frame == 1) { - // Don't update golden reference if the amount of low_content for the - // current encoded frame is small, or if the recursive average of the - // low_content over the update interval window falls below threshold. - if (fraction_low < 0.8 || cr->low_content_avg < 0.7) - cpi->refresh_golden_frame = 0; - // Reset for next internal. - cr->low_content_avg = fraction_low; - } + if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40) + rc->baseline_gf_interval = 10; } // Update the segmentation map, and related quantities: cyclic refresh map, @@ -383,13 +355,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. 
if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { - consec_zero_mv_thresh = 80; + consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), - 7 * cm->base_qindex >> 3); + cm->base_qindex); } do { int sum_map = 0; + int consec_zero_mv_thresh_block = consec_zero_mv_thresh; // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; @@ -403,16 +376,19 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]); ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + (xmis <= 2 || ymis <= 2)) + consec_zero_mv_thresh_block = 4; for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; // If the block is as a candidate for clean up then mark it // for possible boost/refresh (segment 1). The segment id may get - // reset to 0 later if block gets coded anything other than ZEROMV. + // reset to 0 later depending on the coding mode. if (cr->map[bl_index2] == 0) { count_tot++; if (cr->last_coded_q_map[bl_index2] > qindex_thresh || - cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) { + cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; } @@ -445,9 +421,21 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num8x8bl = cm->MBs << 2; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int thresh_low_motion = (cm->width < 720) ? 55 : 20; + cr->apply_cyclic_refresh = 1; + if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && + rc->frames_since_key > 40)) { + cr->apply_cyclic_refresh = 0; + return; + } cr->percent_refresh = 10; if (cr->reduce_refresh) cr->percent_refresh = 5; - cr->max_qdelta_perc = 50; + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->motion_thresh = 32; cr->rate_boost_fac = 15; @@ -466,10 +454,15 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_boost_fac = 13; } } - // Adjust some parameters for low resolutions at low bitrates. - if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 10; + // Adjust some parameters for low resolutions. + if (cm->width <= 352 && cm->height <= 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); + } } if (cpi->svc.spatial_layer_id > 0) { cr->motion_thresh = 4; @@ -487,6 +480,19 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = 1.0; } } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. To be used for setting the base qp for the + // frame in vp9_rc_regulate_q. 
+ target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num8x8bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num8x8bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + cr->weight_segment = weight_segment; } // Setup cyclic background refresh: set delta q and segmentation map. @@ -495,14 +501,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; - // TODO(marpan): Look into whether we should reduce the amount/delta-qp - // instead of completely shutting off at low bitrates. For now keep it on. - // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); - const int apply_cyclic_refresh = 1; if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; - // Don't apply refresh on key frame or temporal enhancement layer frames. - if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) || - (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) { + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); @@ -511,6 +511,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; + cr->reduce_refresh = 0; } return; } else { diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index a4be031295..77fa67c9e1 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -66,6 +66,8 @@ struct CYCLIC_REFRESH { double low_content_avg; int qindex_delta[3]; int reduce_refresh; + double weight_segment; + int apply_cyclic_refresh; }; struct VP9_COMP; @@ -104,15 +106,15 @@ void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, // refresh sb_index, and target number of blocks to be refreshed. void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi); -// Update the actual number of blocks that were applied the segment delta q. +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi); // Set golden frame update interval, for non-svc 1 pass CBR mode. void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi); -// Check if we should not update golden reference, based on past refresh stats. -void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi); - // Set/update global/frame level refresh parameters. 
void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi); diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.c b/libs/libvpx/vp9/encoder/vp9_bitstream.c index 860da83330..d346cd57aa 100644 --- a/libs/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libs/libvpx/vp9/encoder/vp9_bitstream.c @@ -80,8 +80,8 @@ static void prob_diff_update(const vpx_tree_index *tree, vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd, - vpx_writer *w) { +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *const xd, vpx_writer *w) { TX_SIZE tx_size = xd->mi[0]->tx_size; BLOCK_SIZE bsize = xd->mi[0]->sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; @@ -95,7 +95,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *const xd, int segment_id, const MODE_INFO *mi, vpx_writer *w) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; @@ -195,7 +195,7 @@ static void write_segment_id(vpx_writer *w, const struct segmentation *seg, } // This function encodes the reference frame -static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, vpx_writer *w) { const MODE_INFO *const mi = xd->mi[0]; const int is_compound = has_second_ref(mi); @@ -213,7 +213,7 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, if (cm->reference_mode == REFERENCE_MODE_SELECT) { vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd)); } else { - assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE)); + assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); } if (is_compound) { @@ -230,14 +230,15 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, - vpx_writer *w) { +static void pack_inter_mode_mvs( + VP9_COMP *cpi, const MACROBLOCKD *const xd, + const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc->nmvc; - const MACROBLOCK *const x = &cpi->td.mb; - const MACROBLOCKD *const xd = &x->e_mbd; const struct segmentation *const seg = &cm->seg; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const MODE_INFO *const mi = xd->mi[0]; const PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; const BLOCK_SIZE bsize = mi->sb_type; @@ -299,7 +300,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, vp9_write_token(w, vp9_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx], &switchable_interp_encodings[mi->interp_filter]); - ++cpi->interp_filter_selected[0][mi->interp_filter]; + ++interp_filter_selected[0][mi->interp_filter]; } else { assert(mi->interp_filter == cm->interp_filter); } @@ -317,7 +318,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (ref = 0; ref < 1 + is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, - nmvc, allow_hp); + nmvc, allow_hp, max_mv_magnitude); } } } @@ -326,16 +327,16 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (ref = 0; ref < 1 + 
is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv, &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc, - allow_hp); + allow_hp, max_mv_magnitude); } } } } static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO **mi_8x8, vpx_writer *w) { + vpx_writer *w) { const struct segmentation *const seg = &cm->seg; - const MODE_INFO *const mi = mi_8x8[0]; + const MODE_INFO *const mi = xd->mi[0]; const MODE_INFO *const above_mi = xd->above_mi; const MODE_INFO *const left_mi = xd->left_mi; const BLOCK_SIZE bsize = mi->sb_type; @@ -366,27 +367,27 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]); } -static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col) { +static void write_modes_b( + VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, + vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const MB_MODE_INFO_EXT *const mbmi_ext = + cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); MODE_INFO *m; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); m = xd->mi[0]; - cpi->td.mb.mbmi_ext = - cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->sb_type], mi_col, num_8x8_blocks_wide_lookup[m->sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cm, xd, xd->mi, w); + write_mb_modes_kf(cm, xd, w); } else { - pack_inter_mode_mvs(cpi, m, w); + pack_inter_mode_mvs(cpi, xd, mbmi_ext, w, max_mv_magnitude, + interp_filter_selected); } assert(*tok < tok_end); @@ -415,13 +416,13 @@ static void write_partition(const VP9_COMMON *const cm, } } -static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { +static void write_modes_sb( + VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, + vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - const int bsl = b_width_log2_lookup[bsize]; const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; @@ -436,30 +437,37 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); if (subsize < BLOCK_8X8) { - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); } else { switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_HORZ: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); 
if (mi_row + bs < cm->mi_rows) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_VERT: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); if (mi_col + bs < cm->mi_cols) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_SPLIT: - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs, - subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col, - subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, - subsize); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, + max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); break; default: assert(0); } @@ -471,21 +479,32 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end) { +static void write_modes( + VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, + vpx_writer *w, int tile_row, int tile_col, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - int mi_row, mi_col; + int mi_row, mi_col, tile_sb_row; + TOKENEXTRA *tok = NULL; + TOKENEXTRA *tok_end = NULL; set_partition_probs(cm, xd); for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start; + tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count; + vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64); + write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col, + BLOCK_64X64, max_mv_magnitude, interp_filter_selected); + + assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop); } } @@ -900,11 +919,130 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { } } +static int encode_tile_worker(void *arg1, void *arg2) { + VP9_COMP *cpi = (VP9_COMP *)arg1; + VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2; + MACROBLOCKD *const xd = &data->xd; + const int tile_row = 0; + vpx_start_encode(&data->bit_writer, data->dest); + write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, + &data->bit_writer, tile_row, data->tile_idx, + &data->max_mv_magnitude, data->interp_filter_selected); + vpx_stop_encode(&data->bit_writer); + return 1; +} + +void 
vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { + if (cpi->vp9_bitstream_worker_data) { + int i; + for (i = 1; i < cpi->num_workers; ++i) { + vpx_free(cpi->vp9_bitstream_worker_data[i].dest); + } + vpx_free(cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = NULL; + } +} + +static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) { + int i; + const size_t worker_data_size = + cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size); + memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); + if (!cpi->vp9_bitstream_worker_data) return 1; + for (i = 1; i < cpi->num_workers; ++i) { + cpi->vp9_bitstream_worker_data[i].dest_size = + cpi->oxcf.width * cpi->oxcf.height; + cpi->vp9_bitstream_worker_data[i].dest = + vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size); + if (!cpi->vp9_bitstream_worker_data[i].dest) return 1; + } + return 0; +} + +static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = cpi->num_workers; + size_t total_size = 0; + int tile_col = 0; + + if (!cpi->vp9_bitstream_worker_data || + cpi->vp9_bitstream_worker_data[1].dest_size > + (cpi->oxcf.width * cpi->oxcf.height)) { + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + if (encode_tiles_buffer_alloc(cpi)) return 0; + } + + while (tile_col < tile_cols) { + int i, j; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VPxWorker *const worker = &cpi->workers[i]; + VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i]; + + // Populate the worker data. + data->xd = cpi->td.mb.e_mbd; + data->tile_idx = tile_col; + data->max_mv_magnitude = cpi->max_mv_magnitude; + memset(data->interp_filter_selected, 0, + sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE); + + // First thread can directly write into the output buffer. + if (i == 0) { + // If this worker happens to be for the last tile, then do not offset it + // by 4 for the tile size. + data->dest = + data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4); + } + worker->data1 = cpi; + worker->data2 = data; + worker->hook = encode_tile_worker; + worker->had_error = 0; + + if (i < num_workers - 1) { + winterface->launch(worker); + } else { + winterface->execute(worker); + } + ++tile_col; + } + for (j = 0; j < i; ++j) { + VPxWorker *const worker = &cpi->workers[j]; + VP9BitstreamWorkerData *const data = + (VP9BitstreamWorkerData *)worker->data2; + uint32_t tile_size; + int k; + + if (!winterface->sync(worker)) return 0; + tile_size = data->bit_writer.pos; + + // Aggregate per-thread bitstream stats. + cpi->max_mv_magnitude = + VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude); + for (k = 0; k < SWITCHABLE; ++k) { + cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k]; + } + + // Prefix the size of the tile on all but the last. 
+ if (tile_col != tile_cols || j < i - 1) { + mem_put_be32(data_ptr + total_size, tile_size); + total_size += 4; + } + if (j > 0) { + memcpy(data_ptr + total_size, data->dest, tile_size); + } + total_size += tile_size; + } + } + return total_size; +} + static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; vpx_writer residual_bc; int tile_row, tile_col; - TOKENEXTRA *tok_end; size_t total_size = 0; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; @@ -912,22 +1050,27 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); + // Encoding tiles in parallel is done only for realtime mode now. In other + // modes the speed up is insignificant and requires further testing to ensure + // that it does not make the overall process worse in any case. + if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 && + tile_cols > 1) { + return encode_tiles_mt(cpi, data_ptr); + } + for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_idx = tile_row * tile_cols + tile_col; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; - - tok_end = cpi->tile_tok[tile_row][tile_col] + - cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) vpx_start_encode(&residual_bc, data_ptr + total_size + 4); else vpx_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &residual_bc, &tok, - tok_end); - assert(tok == tok_end); + write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc, + tile_row, tile_col, &cpi->max_mv_magnitude, + cpi->interp_filter_selected); + vpx_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { // size of this tile @@ -938,7 +1081,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { total_size += residual_bc.pos; } } - return total_size; } diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.h b/libs/libvpx/vp9/encoder/vp9_bitstream.h index 8c97d37f77..339c3fecb1 100644 --- a/libs/libvpx/vp9/encoder/vp9_bitstream.h +++ b/libs/libvpx/vp9/encoder/vp9_bitstream.h @@ -17,8 +17,24 @@ extern "C" { #include "vp9/encoder/vp9_encoder.h" +typedef struct VP9BitstreamWorkerData { + uint8_t *dest; + int dest_size; + vpx_writer bit_writer; + int tile_idx; + unsigned int max_mv_magnitude; + // The size of interp_filter_selected in VP9_COMP is actually + // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do + // is increment the very first index (index 0) for the first dimension. Hence + // this is sufficient. 
+ int interp_filter_selected[1][SWITCHABLE]; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} VP9BitstreamWorkerData; + int vp9_get_refresh_mask(VP9_COMP *cpi); +void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi); + void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { diff --git a/libs/libvpx/vp9/encoder/vp9_block.h b/libs/libvpx/vp9/encoder/vp9_block.h index 1ea5fdf1ff..724205dd57 100644 --- a/libs/libvpx/vp9/encoder/vp9_block.h +++ b/libs/libvpx/vp9/encoder/vp9_block.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_BLOCK_H_ #define VP9_ENCODER_VP9_BLOCK_H_ +#include "vpx_util/vpx_thread.h" + #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -61,6 +63,11 @@ typedef struct { typedef struct macroblock MACROBLOCK; struct macroblock { +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if defined(_MSC_VER) && _MSC_VER < 1900 + int64_t bsse[MAX_MB_PLANE << 2]; +#endif + struct macroblock_plane plane[MAX_MB_PLANE]; MACROBLOCKD e_mbd; @@ -86,8 +93,6 @@ struct macroblock { int rddiv; int rdmult; int mb_energy; - int *m_search_count_ptr; - int *ex_search_count_ptr; // These are set to their default values at the beginning, and then adjusted // further in the encoding process. @@ -118,6 +123,9 @@ struct macroblock { // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; + // Accumulate the tx block eobs in a partition block. + int32_t sum_y_eobs[TX_SIZES]; + int skip; int encode_breakout; @@ -131,6 +139,10 @@ struct macroblock { int use_lp32x32fdct; int skip_encode; + // In first pass, intra prediction is done based on source pixels + // at tile boundaries + int fp_src_pred; + // use fast quantization process int quant_fp; @@ -140,7 +152,10 @@ struct macroblock { #define SKIP_TXFM_AC_DC 1 #define SKIP_TXFM_AC_ONLY 2 +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if !defined(_MSC_VER) || _MSC_VER >= 1900 int64_t bsse[MAX_MB_PLANE << 2]; +#endif // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; @@ -151,16 +166,38 @@ struct macroblock { uint8_t sb_is_skin; + uint8_t skip_low_source_sad; + + uint8_t lowvar_highsumdiff; + + uint8_t last_sb_high_content; + + int sb_use_mv_part; + + int sb_mvcol_part; + + int sb_mvrow_part; + + int sb_pickmode_part; + + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) + // based on source sad, prior to encoding the frame. + uint8_t content_state_sb; + // Used to save the status of whether a block has a low variance in // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for // 32x32, 9~24 for 16x16. 
uint8_t variance_low[25]; - void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); - void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); + uint8_t arf_frame_usage; + uint8_t lastgolden_frame_usage; + + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd); + void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); #endif }; diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.h b/libs/libvpx/vp9/encoder/vp9_context_tree.h index 86ba03d69f..73423c0758 100644 --- a/libs/libvpx/vp9/encoder/vp9_context_tree.h +++ b/libs/libvpx/vp9/encoder/vp9_context_tree.h @@ -65,12 +65,16 @@ typedef struct { int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; #endif // motion vector cache for adaptive motion search control in partition // search loop MV pred_mv[MAX_REF_FRAMES]; INTERP_FILTER pred_interp_filter; + + // Used for the machine learning-based early termination + int32_t sum_y_eobs; } PICK_MODE_CONTEXT; typedef struct PC_TREE { diff --git a/libs/libvpx/vp9/encoder/vp9_dct.c b/libs/libvpx/vp9/encoder/vp9_dct.c index bb8c23fdb9..5c66562a56 100644 --- a/libs/libvpx/vp9/encoder/vp9_dct.c +++ b/libs/libvpx/vp9/encoder/vp9_dct.c @@ -556,9 +556,8 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -567,6 +566,8 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, int i, j; tran_low_t intermediate[64]; + (void)iscan; + // Transform columns { tran_low_t *output = intermediate; @@ -632,12 +633,6 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2; } - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. 
- (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.c b/libs/libvpx/vp9/encoder/vp9_denoiser.c index 1d9a6702df..b08ccaa66c 100644 --- a/libs/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libs/libvpx/vp9/encoder/vp9_denoiser.c @@ -185,33 +185,42 @@ static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, } static VP9_DENOISER_DECISION perform_motion_compensation( - VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, + VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, - int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv) { - int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; - MV_REFERENCE_FRAME frame; + int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, + int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, + int use_svc, int spatial_layer) { + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); + int frame; + int denoise_layer_idx = 0; MACROBLOCKD *filter_mbd = &mb->e_mbd; MODE_INFO *mi = filter_mbd->mi[0]; MODE_INFO saved_mi; int i; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; + RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; + saved_mi = *mi; if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK; - // Avoid denoising for small block (unless motion is small). - // Small blocks are selected in variance partition (before encoding) and - // will typically lie on moving areas. - if (denoiser->denoising_level < kDenHigh && motion_magnitude > 16 && - bs <= BLOCK_8X8) + // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, + // denoise 16x16 blocks. + if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || + (bs == BLOCK_16X16 && width > 480 && + denoiser->denoising_level <= kDenLow)) return COPY_BLOCK; // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. - if (frame != INTRA_FRAME && ctx->newmv_sse != UINT_MAX && + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && + (frame != GOLDEN_FRAME || num_spatial_layers == 1) && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; @@ -221,9 +230,10 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. - if (frame != LAST_FRAME && - ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || - denoiser->denoising_level >= kDenHigh)) { + if (num_spatial_layers > 1 || frame == ALTREF_FRAME || + (frame != LAST_FRAME && + ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || + denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; } @@ -238,6 +248,19 @@ static VP9_DENOISER_DECISION perform_motion_compensation( } } + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. 
+ if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -254,6 +277,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( saved_pre[i] = filter_mbd->plane[i].pre[0]; saved_dst[i] = filter_mbd->plane[i].dst; } + saved_block_refs[0] = filter_mbd->block_refs[0]; // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser // struct. @@ -270,23 +294,28 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; - filter_mbd->plane[0].dst.buf = - block_start(denoiser->mc_running_avg_y.y_buffer, - denoiser->mc_running_avg_y.y_stride, mi_row, mi_col); - filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; - filter_mbd->plane[1].dst.buf = - block_start(denoiser->mc_running_avg_y.u_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; - filter_mbd->plane[2].dst.buf = - block_start(denoiser->mc_running_avg_y.v_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; + filter_mbd->plane[0].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); + filter_mbd->plane[0].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; + filter_mbd->plane[1].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + filter_mbd->plane[2].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state *mi = saved_mi; + filter_mbd->block_refs[0] = saved_block_refs[0]; for (i = 0; i < MAX_MB_PLANE; ++i) { filter_mbd->plane[i].pre[0] = saved_pre[i]; filter_mbd->plane[i].dst = saved_dst[i]; @@ -303,13 +332,22 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, int zeromv_filter = 0; VP9_DENOISER *denoiser = &cpi->denoiser; VP9_DENOISER_DECISION decision = COPY_BLOCK; - YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; - YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? 
denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; int is_skin = 0; + int increase_denoising = 0; int consec_zeromv = 0; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; @@ -326,8 +364,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, VP9_COMMON *const cm = &cpi->common; int j, i; // Loop through the 8x8 sub-blocks. - const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int bw = num_8x8_blocks_wide_lookup[bs]; + const int bh = num_8x8_blocks_high_lookup[bs]; const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; @@ -352,30 +390,28 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv, motion_level); } - if (!is_skin && denoiser->denoising_level == kDenHigh) { - denoiser->increase_denoising = 1; - } else { - denoiser->increase_denoising = 0; - } + if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - if (denoiser->denoising_level >= kDenLow) + if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) decision = perform_motion_compensation( - denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx, - motion_magnitude, is_skin, &zeromv_filter, consec_zeromv); + &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, + motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, + cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); if (decision == FILTER_BLOCK) { - decision = vp9_denoiser_filter( - src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start, - avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude); + decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { - vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } *denoiser_decision = decision; @@ -408,19 +444,23 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, src->y_buffer = tmp_buf; } -void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, - YV12_BUFFER_CONFIG src, - FRAME_TYPE frame_type, - int refresh_alt_ref_frame, - int refresh_golden_frame, - int refresh_last_frame, int resized) { +void vp9_denoiser_update_frame_info( + VP9_DENOISER 
*denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, + int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or - // if the just encoded frame was resized. - if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset) { + // if the just encoded frame was resized. For SVC, copy source if the base + // spatial layer was key frame. + if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || + svc_base_is_key) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME - for (i = 1; i < MAX_REF_FRAMES; ++i) - copy_frame(&denoiser->running_avg_y[i], &src); + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } denoiser->reset = 0; return; } @@ -428,29 +468,29 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, // If more than one refresh occurs, must copy frame buffer. if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } } @@ -479,19 +519,110 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, } } -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, + VP9_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = + vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width, + cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int 
refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in the + // denoiser. + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + return 0; +} + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif int border) { - int i, fail; + int i, layer, fail, init_num_ref_frames; const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + vpx_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + cm, denoiser->mc_running_avg_y, + vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); - for (i = 0; i < MAX_REF_FRAMES; ++i) { - fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, - ssx, ssy, + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? 
height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = vpx_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], + denoise_width, denoise_height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif @@ -500,22 +631,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, vp9_denoiser_free(denoiser); return 1; } -#ifdef OUTPUT_YUV_DENOISED - make_grayscale(&denoiser->running_avg_y[i]); -#endif - } - - fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx, - ssy, -#if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, -#endif - border, legacy_byte_alignment); - if (fail) { - vp9_denoiser_free(denoiser); - return 1; } + // denoiser->last_source only used for noise_estimation, so only for top + // layer. fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, @@ -528,7 +647,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, #ifdef OUTPUT_YUV_DENOISED make_grayscale(&denoiser->running_avg_y[i]); #endif - denoiser->increase_denoising = 0; denoiser->frame_buffer_initialized = 1; denoiser->denoising_level = kDenLow; denoiser->prev_denoising_level = kDenLow; @@ -542,10 +660,18 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { return; } denoiser->frame_buffer_initialized = 0; - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { vpx_free_frame_buffer(&denoiser->running_avg_y[i]); } - vpx_free_frame_buffer(&denoiser->mc_running_avg_y); + vpx_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + vpx_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; vpx_free_frame_buffer(&denoiser->last_source); } @@ -559,6 +685,34 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { denoiser->prev_denoising_level = denoiser->denoising_level; } +// Scale/increase the partition threshold +// for denoiser speed-up. +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) || + (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { + return (5 * threshold) >> 2; + } +} + +// Scale/increase the ac skip threshold for +// denoiser speed-up. +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { + if (noise_level >= kDenLow && abs_sumdiff < 5) + return threshold *= + (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 
10 : 6; + else + return threshold; +} + #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { int r, c; diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.h b/libs/libvpx/vp9/encoder/vp9_denoiser.h index fcfaa5051a..f4da24cbf6 100644 --- a/libs/libvpx/vp9/encoder/vp9_denoiser.h +++ b/libs/libvpx/vp9/encoder/vp9_denoiser.h @@ -21,6 +21,14 @@ extern "C" { #define MOTION_MAGNITUDE_THRESHOLD (8 * 3) +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. [0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + typedef enum vp9_denoiser_decision { COPY_BLOCK, FILTER_BLOCK, @@ -35,12 +43,13 @@ typedef enum vp9_denoiser_level { } VP9_DENOISER_LEVEL; typedef struct vp9_denoiser { - YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; - YV12_BUFFER_CONFIG mc_running_avg_y; + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG last_source; - int increase_denoising; int frame_buffer_initialized; int reset; + int num_ref_frames; + int num_layers; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -58,13 +67,13 @@ typedef struct { } VP9_PICKMODE_CTX_DEN; struct VP9_COMP; +struct SVC; -void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, - YV12_BUFFER_CONFIG src, - FRAME_TYPE frame_type, - int refresh_alt_ref_frame, - int refresh_golden_frame, - int refresh_last_frame, int resized); +void vp9_denoiser_update_frame_info( + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, + int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, @@ -76,8 +85,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx); + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif @@ -97,6 +112,13 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser); void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level); +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id); + +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.c b/libs/libvpx/vp9/encoder/vp9_encodeframe.c index 335faca82b..682477df18 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.c @@ -52,6 +52,33 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int output_enabled, int mi_row, int mi_col, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); +// 
Machine learning-based early termination parameters. +static const double train_mean[24] = { + 303501.697372, 3042630.372158, 24.694696, 1.392182, + 689.413511, 162.027012, 1.478213, 0.0, + 135382.260230, 912738.513263, 28.845217, 1.515230, + 544.158492, 131.807995, 1.436863, 0.0, + 43682.377587, 208131.711766, 28.084737, 1.356677, + 138.254122, 119.522553, 1.252322, 0.0 +}; + +static const double train_stdm[24] = { + 673689.212982, 5996652.516628, 0.024449, 1.989792, + 985.880847, 0.014638, 2.001898, 0.0, + 208798.775332, 1812548.443284, 0.018693, 1.838009, + 396.986910, 0.015657, 1.332541, 0.0, + 55888.847031, 448587.962714, 0.017900, 1.904776, + 98.652832, 0.016598, 1.320992, 0.0 +}; + +// Error tolerance: 0.01%-0.0.05%-0.1% +static const double classifiers[24] = { + 0.111736, 0.289977, 0.042219, 0.204765, 0.120410, -0.143863, + 0.282376, 0.847811, 0.637161, 0.131570, 0.018636, 0.202134, + 0.112797, 0.028162, 0.182450, 1.124367, 0.386133, 0.083700, + 0.050028, 0.150873, 0.061119, 0.109318, 0.127255, 0.625211 +}; + // This is used as a reference when computing the source variance for the // purpose of activity masking. // Eventually this should be replaced by custom no-reference routines, @@ -98,19 +125,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { }; #endif // CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs) { +unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); + return var; } #if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs, int bd) { +unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { unsigned int var, sse; switch (bd) { case 10: @@ -130,7 +155,24 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); break; } - return ROUND64_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]); + return var; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs), + num_pels_log2_lookup[bs]); +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + return (unsigned int)ROUND64_POWER_OF_TWO( + (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd), + num_pels_log2_lookup[bs]); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -259,8 +301,12 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x, } typedef struct { - int64_t sum_square_error; - int64_t sum_error; + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even + // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16 + // * 16 = 2^32). + uint32_t sum_square_error; + int32_t sum_error; int log2_count; int variance; } var; @@ -353,7 +399,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. 
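/* Illustrative sketch (not part of this patch): what "set variance values
 * given sum square error, sum error, count" amounts to, i.e. the integer form
 * of var = E[x^2] - E[x]^2 over 1 << log2_count samples. The exact fixed-point
 * scaling used by libvpx's fill_variance()/get_variance() may differ; this
 * only shows the arithmetic the uint32_t/int32_t field widths above have to
 * accommodate. */
#include <stdint.h>

static uint32_t variance_from_sums(uint32_t sum_square_error,
                                   int32_t sum_error, int log2_count) {
  /* sum_error may be negative; widen before squaring to avoid overflow. */
  const int64_t mean_sq = ((int64_t)sum_error * sum_error) >> log2_count;
  return (uint32_t)(((int64_t)sum_square_error - mean_sq) >> log2_count);
}
/* Example: 16 samples (log2_count = 4) all equal to 100 give sum = 1600 and
 * sse = 160000, so variance_from_sums() returns 0, as expected. */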
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; @@ -461,24 +507,46 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + if (speed >= 8) { + if (width <= 640 && height <= 480) + return (5 * threshold_base) >> 2; + else if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) + return (5 * threshold_base) >> 2; + } else if (speed == 7) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) { + return (5 * threshold_base) >> 2; + } + } + return threshold_base; +} + // Set the variance split thresholds for following the block sizes: // 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, // 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is // currently only used on key frame. -static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { +static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, + int content_state) { VP9_COMMON *const cm = &cpi->common; const int is_key_frame = (cm->frame_type == KEY_FRAME); const int threshold_multiplier = is_key_frame ? 20 : 1; int64_t threshold_base = (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); + if (is_key_frame) { thresholds[0] = threshold_base; thresholds[1] = threshold_base >> 2; thresholds[2] = threshold_base >> 2; thresholds[3] = threshold_base << 2; } else { - // Increase base variance threshold based on estimated noise level. - if (cpi->noise_estimate.enabled) { + // Increase base variance threshold based on estimated noise level. + if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) { NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); if (noise_level == kHigh) @@ -488,8 +556,25 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { else if (noise_level < kLow) threshold_base = (7 * threshold_base) >> 3; } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) + threshold_base = + vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level, + content_state, cpi->svc.temporal_layer_id); + else + threshold_base = + scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width, + cm->height, content_state); +#else + // Increase base variance threshold based on content_state/sum_diff level. 
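/* Illustrative sketch (assumed example values, not part of this patch): the
 * threshold scaling in vp9_scale_part_thresh() / scale_part_thresh_sumdiff()
 * above is plain fixed-point multiplication of the base threshold T:
 *   (5 * T) >> 2 == 1.25 * T   (low-sumdiff / sumdiff-based scaling)
 *   (3 * T) >> 1 == 1.50 * T   (denoiser path, lower temporal layers)
 *   (7 * T) >> 2 == 1.75 * T   (denoiser path, higher temporal layers)
 */
#include <assert.h>
#include <stdint.h>

static void scale_factor_example(void) {
  const int64_t T = 8000;
  assert(((5 * T) >> 2) == 10000);
  assert(((3 * T) >> 1) == 12000);
  assert(((7 * T) >> 2) == 14000);
}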
+ threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); +#endif thresholds[0] = threshold_base; thresholds[2] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7) + thresholds[2] = thresholds[2] << 1; if (cm->width <= 352 && cm->height <= 288) { thresholds[0] = threshold_base >> 3; thresholds[1] = threshold_base >> 1; @@ -504,7 +589,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { } } -void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { +void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, + int content_state) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; const int is_key_frame = (cm->frame_type == KEY_FRAME); @@ -512,10 +598,11 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { sf->partition_search_type != REFERENCE_PARTITION) { return; } else { - set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state); // The thresholds below are not changed locally. if (is_key_frame) { cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; cpi->vbp_bsize_min = BLOCK_8X8; } else { if (cm->width <= 352 && cm->height <= 288) @@ -525,6 +612,14 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { ? (cpi->y_dequant[q][1] << 1) : 1000; cpi->vbp_bsize_min = BLOCK_16X16; + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_copy = 4000; + else if (cm->width <= 640 && cm->height <= 360) + cpi->vbp_threshold_copy = 8000; + else + cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000 + ? (cpi->y_dequant[q][1] << 3) + : 8000; } cpi->vbp_threshold_minmax = 15 + (q >> 3); } @@ -638,12 +733,14 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, } } -#if !CONFIG_VP9_HIGHBITDEPTH // Check if most of the superblock is skin content, and if so, force split to // 32x32, and set x->sb_is_skin for use in mode selection. static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, int mi_row, int mi_col, int *force_split) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return 0; +#endif // Avoid checking superblocks on/near boundary and avoid low resolutions. // Note superblock may still pick 64X64 if y_sad is very small // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is. 
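/* Illustrative sketch (hypothetical helper, not part of this patch): the vbp
 * threshold array filled in by set_vbp_thresholds() above is indexed per split
 * level -- thresholds[0] for 64x64, [1] for 32x32, [2] for 16x16 and [3] for
 * the 8x8/4x4 decision on key frames -- and a block is pushed toward a split
 * when its measured variance exceeds the threshold for its level: */
#include <stdint.h>

static int variance_suggests_split(int64_t block_variance,
                                   const int64_t thresholds[4], int level) {
  return block_variance > thresholds[level];
}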
@@ -666,16 +763,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, for (i = 0; i < ymis; i += 2) { for (j = 0; j < xmis; j += 2) { int bl_index = block_index + i * cm->mi_cols + j; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - int consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - int is_skin = vp9_compute_skin_block( - ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0); + int is_skin = cpi->skin_map[bl_index]; num_16x16_skin += is_skin; num_16x16_nonskin += (1 - is_skin); if (num_16x16_nonskin > 3) { @@ -698,7 +786,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, } return 0; } -#endif static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, v64x64 *vt, int64_t thresholds[], @@ -742,9 +829,13 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, continue; if ((*this_mi)->sb_type == BLOCK_32X32) { - if (vt->split[i].part_variances.none.variance < (thresholds[1] >> 1)) + int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) + ? ((5 * thresholds[1]) >> 3) + : (thresholds[1] >> 1); + if (vt->split[i].part_variances.none.variance < threshold_32x32) x->variance_low[i + 5] = 1; - } else if (cpi->sf.short_circuit_low_temp_var == 2) { + } else if (cpi->sf.short_circuit_low_temp_var >= 2) { // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block // inside. if ((*this_mi)->sb_type == BLOCK_16X16 || @@ -762,12 +853,297 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } } +static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, + MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = partition_lookup[bsl][prev_part[start_pos]]; + subsize = get_subsize(bsize, partition); + + if (subsize < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + } else { + switch (partition) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); + break; + case PARTITION_SPLIT: + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + int mi_row, int mi_col, int segment_id, + int sb_offset) { + int svc_copy_allowed = 1; + int frames_since_key_thresh = 1; + if (cpi->use_svc) { + // For SVC, don't allow copy if base spatial layer is key frame, or if + // frame is not a temporal 
enhancement layer frame. + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0; + frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; + } + if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && + !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE && + cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE && + cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) { + if (cpi->prev_partition != NULL) { + copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col); + cpi->copied_frame_cnt[sb_offset] += 1; + memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]), + sizeof(x->variance_low)); + return 1; + } + } + + return 0; +} + +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int mi_row_high, int mi_col_high) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BLOCK_SIZE *prev_part = svc->prev_partition_svc; + // Variables with _high are for higher resolution. + int bsize_high = 0; + int subsize_high = 0; + const int bsl_high = b_width_log2_lookup[bsize]; + const int bs_high = (1 << bsl_high) >> 2; + const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; + const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; + + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0 + }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0 + }; + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; + if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; + + // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + // The block size is too big for boundaries. Do variance based partitioning. + if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1; + + // For reference frames: return 1 (do variance-based partitioning) if the + // superblock is not low source sad and lower-resoln bsize is below 32x32. + if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad && + bsize_low < BLOCK_32X32) + return 1; + + // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { + bsize_high = BLOCK_64X64; + } + // Scale up blocks on boundary. 
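/* Illustrative sketch (enum values assumed for illustration, not part of this
 * patch): in the VP9 BLOCK_SIZE enum the sizes from 8x8 upward are laid out so
 * that adding 3 doubles both dimensions (BLOCK_8X8 + 3 == BLOCK_16X16,
 * BLOCK_16X16 + 3 == BLOCK_32X32, BLOCK_8X16 + 3 == BLOCK_16X32, ...). That is
 * what the "bsize_high = bsize_low + 3" scale-up above relies on when mapping
 * a partition decided on the 2x-downscaled spatial layer onto the full
 * resolution layer, with anything at or above 32x32 promoted to 64x64: */
static int scale_up_2x2(int bsize_low) {
  const int kBlock32x32 = 9, kBlock64x64 = 12; /* assumed enum values */
  return (bsize_low < kBlock32x32) ? bsize_low + 3 : kBlock64x64;
}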
+ if (!has_cols && has_rows) { + bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low]; + } else if (has_cols && !has_rows) { + bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low]; + } else if (!has_cols && !has_rows) { + bsize_high = bsize_low; + } + + partition_high = partition_lookup[bsl_high][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + } else { + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + switch (partition_high) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high, + subsize_high); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, + subsize_high); + break; + case PARTITION_SPLIT: + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs_high, + mi_col_high + bs_high)) + return 1; + break; + default: assert(0); + } + } + + return 0; +} + +static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + int xx, yy; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: + prev_part[start_pos] = bsize; + if (bsize == BLOCK_64X64) { + for (xx = 0; xx < 8; xx += 4) + for (yy = 0; yy < 8; yy += 4) { + if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols)) + prev_part[start_pos + xx * cm->mi_stride + yy] = bsize; + } + } + break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_partition_svc(cpi, subsize, mi_row, mi_col); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col); + update_partition_svc(cpi, subsize, mi_row, mi_col + bs); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = 
cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: prev_part[start_pos] = bsize; break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_prev_partition_helper(cpi, subsize, mi_row, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id, + int mi_row, int mi_col, int sb_offset) { + update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[sb_offset] = segment_id; + memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, + sizeof(x->variance_low)); + // Reset the counter for copy partitioning + cpi->copied_frame_cnt[sb_offset] = 0; +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; MACROBLOCKD *xd = &x->e_mbd; + if (is_key_frame) return; + // For speed >= 8, avoid the chroma check if y_sad is above threshold. + if (cpi->oxcf.speed >= 8) { + if (y_sad > cpi->vbp_thresholds[1] && + (!cpi->noise_estimate.enabled || + vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium)) + return; + } + for (i = 1; i <= 2; ++i) { unsigned int uv_sad = UINT_MAX; struct macroblock_plane *p = &x->plane[i]; @@ -784,6 +1160,55 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, } } +static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { + unsigned int tmp_sse; + uint64_t tmp_sad; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->Source->y_buffer; + int src_ystride = cpi->Source->y_stride; + uint8_t *last_src_y = cpi->Last_Source->y_buffer; + int last_src_ystride = cpi->Last_Source->y_stride; + uint64_t avg_source_sad_threshold = 10000; + uint64_t avg_source_sad_threshold2 = 12000; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif + src_y += shift; + last_src_y += shift; + tmp_sad = + cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); + tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_sad < avg_source_sad_threshold) + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff + : kLowSadHighSumdiff; + else + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff + : kHighSadHighSumdiff; + + // Detect large lighting change. 
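/* Illustrative worked example (not part of this patch): a 64x64 block has 4096
 * pixels and vpx_variance64x64() returns sse - (sum * sum) / 4096, so the
 * (tmp_sse - tmp_variance) term above recovers (sum * sum) >> 12, as noted.
 * The "< 25" test therefore means sum * sum < 25 * 4096, i.e. |sum| < 320,
 * i.e. an average per-pixel frame difference below roughly 320 / 4096 ~= 0.08,
 * which is what gets classified as the LowSumdiff content states. */
#include <assert.h>
#include <stdint.h>

static void sumdiff_threshold_example(void) {
  const int64_t sum = 320; /* borderline case for a 64x64 block */
  assert(((sum * sum) >> 12) == 25);
}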
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) && + (tmp_sse - tmp_variance) > 10000) + x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; + + if (cpi->content_state_sb_fd != NULL) { + if (tmp_sad < avg_source_sad_threshold2) { + // Cap the increment to 255. + if (cpi->content_state_sb_fd[sb_offset] < 255) + cpi->content_state_sb_fd[sb_offset]++; + } else { + cpi->content_state_sb_fd[sb_offset] = 0; + } + } + return tmp_sad; +} + // This function chooses partitioning based on the variance between source and // reconstructed last, where variance is computed for down-sampled inputs. static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, @@ -792,14 +1217,23 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; int i, j, k, m; v64x64 vt; - v16x16 vt2[16]; + v16x16 *vt2 = NULL; int force_split[21]; int avg_32x32; + int max_var_32x32 = 0; + int min_var_32x32 = INT_MAX; + int var_32x32; int avg_16x16[4]; + int maxvar_16x16[4]; + int minvar_16x16[4]; + int64_t threshold_4x4avg; + NOISE_LEVEL noise_level = kLow; + int content_state = 0; uint8_t *s; const uint8_t *d; int sp; int dp; + int compute_minmax_variance = 1; unsigned int y_sad = UINT_MAX; BLOCK_SIZE bsize = BLOCK_64X64; // Ref frame used in partitioning. @@ -819,17 +1253,61 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; + int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3); set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - if (cyclic_refresh_segment_id_boosted(segment_id)) { - int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - set_vbp_thresholds(cpi, thresholds, q); + + if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) + compute_minmax_variance = 0; + + memset(x->variance_low, 0, sizeof(x->variance_low)); + + if (cpi->sf.use_source_sad && !is_key_frame) { + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + content_state = x->content_state_sb; + x->skip_low_source_sad = (content_state == kLowSadLowSumdiff || + content_state == kLowSadHighSumdiff) + ? 1 + : 0; + x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; + if (cpi->content_state_sb_fd != NULL) + x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) { + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } + // If source_sad is low copy the partition without computing the y_sad. 
+ if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + x->sb_use_mv_part = 1; + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; } } - memset(x->variance_low, 0, sizeof(x->variance_low)); + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id)) { + int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + set_vbp_thresholds(cpi, thresholds, q, content_state); + } else { + set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state); + } + + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); @@ -846,10 +1324,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // that the temporal reference frame will always be of type LAST_FRAME. // TODO(marpan): If that assumption is broken, we need to revisit this code. MODE_INFO *mi = xd->mi[0]; - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const YV12_BUFFER_CONFIG *yv12_g = NULL; - unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad_g, y_sad_thr, y_sad_last; bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows); @@ -861,7 +1339,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); } - if (yv12_g && yv12_g != yv12 && (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + // Only compute y_sad_g (sad for golden reference) for speed < 8. 
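/* Illustrative sketch (assumption; the comparison itself is outside the lines
 * shown here): the golden-frame SAD y_sad_g is weighed against a biased copy
 * of the last-frame SAD -- (y_sad * 7) >> 3, i.e. 7/8 of it, when
 * short_circuit_low_temp_var is on -- so GOLDEN is only chosen as the
 * partitioning reference when it is clearly better than LAST: */
static int pick_golden_for_partitioning(unsigned int y_sad_last,
                                        unsigned int y_sad_golden,
                                        int bias_last) {
  const unsigned int thr = bias_last ? (y_sad_last * 7) >> 3 : y_sad_last;
  return y_sad_golden < thr; /* hypothetical helper for illustration */
}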
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); y_sad_g = cpi->fn_ptr[bsize].sdf( @@ -871,15 +1351,36 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, y_sad_g = UINT_MAX; } - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[LAST_FRAME - 1].sf); - mi->ref_frame[0] = LAST_FRAME; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } mi->ref_frame[1] = NONE; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + if (cpi->oxcf.speed >= 8 && !low_res && + x->content_state_sb != kVeryHighSad) { + y_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad // are close if short_circuit_low_temp_var is on. y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; @@ -898,12 +1399,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); - x->sb_is_skin = 0; -#if !CONFIG_VP9_HIGHBITDEPTH if (cpi->use_skin_detection) x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); -#endif d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; @@ -916,10 +1414,30 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows) { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); + x->variance_low[0] = 1; chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } return 0; } } + + // If the y_sad is small enough, copy the partition of the superblock in the + // last frame to current frame only if the last frame is not a keyframe. + // Stop the copy every cpi->max_copied_frame to refresh the partition. + // TODO(jianj) : tune the threshold. 
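/* Illustrative sketch (hypothetical helper, not part of this patch): the
 * copy/refresh behaviour described above -- a superblock may reuse the
 * previous frame's partition only while fewer than max_copied_frame
 * consecutive frames have done so; recomputing the partition (see
 * update_prev_partition()) resets the per-superblock counter. */
static int try_copy_partition(unsigned char *copied_frame_cnt, int sb_offset,
                              int max_copied_frame) {
  if (copied_frame_cnt[sb_offset] < max_copied_frame) {
    ++copied_frame_cnt[sb_offset]; /* reuse the previous partition */
    return 1;
  }
  return 0; /* recompute; the caller resets the counter afterwards */
}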
+ if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; + } } else { d = VP9_VAR_OFFS; dp = 0; @@ -935,6 +1453,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, #endif // CONFIG_VP9_HIGHBITDEPTH } + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. for (i = 0; i < 4; i++) { @@ -943,6 +1463,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int i2 = i << 2; force_split[i + 1] = 0; avg_16x16[i] = 0; + maxvar_16x16[i] = 0; + minvar_16x16[i] = INT_MAX; for (j = 0; j < 4; j++) { const int x16_idx = x32_idx + ((j & 1) << 4); const int y16_idx = y32_idx + ((j >> 1) << 4); @@ -959,13 +1481,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); get_variance(&vt.split[i].split[j].part_variances.none); avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance < minvar_16x16[i]) + minvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance > maxvar_16x16[i]) + maxvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) { // 16X16 variance is above threshold for split, so force split to 8x8 // for this 16x16 block (this also forces splits for upper levels). force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (cpi->oxcf.speed < 8 && + } else if (compute_minmax_variance && vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { @@ -977,7 +1503,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, xd->cur_buf->flags, #endif pixels_wide, pixels_high); - if (minmax > cpi->vbp_threshold_minmax) { + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (x->content_state_sb == kVeryHighSad) + thresh_minmax = thresh_minmax << 1; + if (minmax > thresh_minmax) { force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; @@ -986,7 +1515,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (is_key_frame || (low_res && vt.split[i].split[j].part_variances.none.variance > - (thresholds[1] << 1))) { + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -1003,6 +1532,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } + if (cpi->noise_estimate.enabled) + noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); // Fill the rest of the variance tree by summing split partition values. avg_32x32 = 0; for (i = 0; i < 4; i++) { @@ -1029,14 +1560,22 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // (64x64) level. 
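/* Illustrative note (layout assumed from the 21-entry force_split[] array used
 * above, not part of this patch): the flags are arranged hierarchically --
 * index 0 for the 64x64 block, 1..4 for its four 32x32 children, 5..20 for the
 * sixteen 16x16 children -- so forcing a split at a leaf also marks every
 * ancestor: */
static void force_split_up_to_root(int *force_split, int idx32 /* 0..3 */,
                                   int idx16 /* 0..3 */) {
  force_split[5 + (idx32 << 2) + idx16] = 1; /* 16x16 leaf */
  force_split[idx32 + 1] = 1;                /* parent 32x32 */
  force_split[0] = 1;                        /* 64x64 root */
}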
if (!force_split[i + 1]) { get_variance(&vt.split[i].part_variances.none); + var_32x32 = vt.split[i].part_variances.none.variance; + max_var_32x32 = VPXMAX(var_32x32, max_var_32x32); + min_var_32x32 = VPXMIN(var_32x32, min_var_32x32); if (vt.split[i].part_variances.none.variance > thresholds[1] || (!is_key_frame && vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) && vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) { force_split[i + 1] = 1; force_split[0] = 1; + } else if (!is_key_frame && noise_level < kLow && cm->height <= 360 && + (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[1] >> 1) && + maxvar_16x16[i] > thresholds[1]) { + force_split[i + 1] = 1; + force_split[0] = 1; } - avg_32x32 += vt.split[i].part_variances.none.variance; + avg_32x32 += var_32x32; } } if (!force_split[0]) { @@ -1044,7 +1583,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, get_variance(&vt.part_variances.none); // If variance of this 64x64 block is above (some threshold of) the average // variance over the sub-32x32 blocks, then force this block to split. - if (!is_key_frame && vt.part_variances.none.variance > (5 * avg_32x32) >> 4) + // Only checking this for noise level >= medium for now. + if (!is_key_frame && noise_level >= kMedium && + vt.part_variances.none.variance > (9 * avg_32x32) >> 5) + force_split[0] = 1; + // Else if the maximum 32x32 variance minus the miniumum 32x32 variance in + // a 64x64 block is greater than threshold and the maximum 32x32 variance is + // above a miniumum threshold, then force the split of a 64x64 block + // Only check this for low noise. + else if (!is_key_frame && noise_level < kMedium && + (max_var_32x32 - min_var_32x32) > 3 * (thresholds[0] >> 3) && + max_var_32x32 > thresholds[0] >> 1) force_split[0] = 1; } @@ -1099,12 +1648,21 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } + if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + + if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); } chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (vt2) vpx_free(vt2); return 0; } @@ -1863,7 +2421,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } } - if (cm->use_prev_frame_mvs || + if (cm->use_prev_frame_mvs || !cm->error_resilient_mode || (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1 && cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) { MV_REF *const frame_mvs = @@ -2460,6 +3018,74 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif +// Calculate the score used in machine-learning based partition search early +// termination. 
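/* Illustrative note (not part of this patch) on the 64x64 split checks above:
 * avg_32x32 accumulates the four 32x32 variances, so it is a sum rather than a
 * mean. The check being replaced compared the 64x64 variance against
 * (5 * avg_32x32) >> 4, i.e. 1.25x the mean 32x32 variance; the new
 * medium/high-noise check uses (9 * avg_32x32) >> 5, i.e. about 1.125x the
 * mean. */
#include <assert.h>

static void avg32_scale_example(void) {
  const int sum_of_four_vars = 3200; /* mean 32x32 variance = 800 */
  assert(((5 * sum_of_four_vars) >> 4) == 1000); /* 1.25  * 800 */
  assert(((9 * sum_of_four_vars) >> 5) == 900);  /* 1.125 * 800 */
}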
+static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const double *clf; + const double *mean; + const double *sd; + const int mag_mv = + abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); + const int left_in_image = !!xd->left_mi; + const int above_in_image = !!xd->above_mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row]; + int above_par = 0; // above_partitioning + int left_par = 0; // left_partitioning + int last_par = 0; // last_partitioning + BLOCK_SIZE context_size; + double score; + int offset = 0; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + + if (above_in_image) { + context_size = xd->above_mi->sb_type; + if (context_size < bsize) + above_par = 2; + else if (context_size == bsize) + above_par = 1; + } + + if (left_in_image) { + context_size = xd->left_mi->sb_type; + if (context_size < bsize) + left_par = 2; + else if (context_size == bsize) + left_par = 1; + } + + if (prev_mi) { + context_size = prev_mi[0]->sb_type; + if (context_size < bsize) + last_par = 2; + else if (context_size == bsize) + last_par = 1; + } + + if (bsize == BLOCK_64X64) + offset = 0; + else if (bsize == BLOCK_32X32) + offset = 8; + else if (bsize == BLOCK_16X16) + offset = 16; + + // early termination score calculation + clf = &classifiers[offset]; + mean = &train_mean[offset]; + sd = &train_stdm[offset]; + score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + + clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + + clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + + clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) + + clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + + clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) + + clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7]; + return score; +} + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. @@ -2505,8 +3131,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr; - int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr; + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; + int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; (void)*tp_orig; @@ -2636,18 +3262,34 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (this_rdc.rdcost < best_rdc.rdcost) { + MODE_INFO *mi = xd->mi[0]; + best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - // If all y, u, v transform blocks in this partition are skippable, and - // the dist & rate are within the thresholds, the partition search is - // terminated for current branch of the partition search tree. - if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; + if (!cpi->sf.ml_partition_search_early_termination) { + // If all y, u, v transform blocks in this partition are skippable, + // and the dist & rate are within the thresholds, the partition search + // is terminated for current branch of the partition search tree. 
+ if (!x->e_mbd.lossless && ctx->skippable && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { + do_split = 0; + do_rect = 0; + } + } else { + // Currently, the machine-learning based partition search early + // termination is only used while bsize is 16x16, 32x32 or 64x64, + // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. + if (!x->e_mbd.lossless && + !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && + ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { + if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) { + do_split = 0; + do_rect = 0; + } + } } #if CONFIG_FP_MB_STATS @@ -2760,7 +3402,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->partitioning = PARTITION_SPLIT; // Rate and distortion based partition search termination clause. - if (!x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || + if (!cpi->sf.ml_partition_search_early_termination && + !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || (best_rdc.dist < dist_breakout_thr && best_rdc.rate < rate_breakout_thr))) { do_rect = 0; @@ -2903,13 +3546,18 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) { const struct segmentation *const seg = &cm->seg; int dummy_rate; int64_t dummy_dist; @@ -2920,6 +3568,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + if (sf->adaptive_pred_interp_filter) { for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; @@ -2971,6 +3622,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, INT64_MAX, td->pc_root); } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } @@ -3044,7 +3697,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - if (bsize < BLOCK_16X16) + if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16) vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); else vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); @@ -3208,6 +3861,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; (void)*tp_orig; + // Avoid checking for rectangular partitions for speed >= 6. 
+ if (cpi->oxcf.speed >= 6) do_rect = 0; + assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); @@ -3251,8 +3907,8 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr; - int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr; + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -3435,6 +4091,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, PARTITION_TYPE partition; BLOCK_SIZE subsize; RD_COST this_rdc; + BLOCK_SIZE subsize_ref = + (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16; vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3448,7 +4106,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, 0, INT64_MAX, pc_tree); } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && - subsize >= BLOCK_16X16) { + subsize >= subsize_ref) { x->max_partition_size = BLOCK_32X32; x->min_partition_size = BLOCK_8X8; nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, @@ -3673,13 +4331,18 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, ++sb_col_in_tile) { const struct segmentation *const seg = &cm->seg; RD_COST dummy_rdc; const int idx_str = cm->mi_stride * mi_row + mi_col; @@ -3687,12 +4350,29 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; BLOCK_SIZE bsize = BLOCK_64X64; int seg_skip = 0; + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + + if (cpi->use_skin_detection) { + vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col); + } + x->source_variance = UINT_MAX; vp9_zero(x->pred_mv); vp9_rd_cost_init(&dummy_rdc); x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; x->sb_is_skin = 0; + x->skip_low_source_sad = 0; + x->lowvar_highsumdiff = 0; + x->content_state_sb = 0; + x->sb_use_mv_part = 0; + x->sb_mvcol_part = 0; + x->sb_mvrow_part = 0; + x->sb_pickmode_part = 0; + x->arf_frame_usage = 0; + x->lastgolden_frame_usage = 0; if (seg->enabled) { const uint8_t *const map = @@ -3704,6 +4384,17 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } } + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + int64_t source_sad = avg_source_sad(cpi, x, 
shift, sb_offset2); + if (sf->adapt_partition_source_sad && + (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref && + source_sad > sf->adapt_partition_thresh && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) + partition_search_type = REFERENCE_PARTITION; + } + // Set the partition type of the 64X64 block switch (partition_search_type) { case VAR_BASED_PARTITION: @@ -3727,15 +4418,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; case REFERENCE_PARTITION: + x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); - // Use nonrd_pick_partition on scene-cut for VBR, or on qp-segment - // if cyclic_refresh is enabled. + // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cm->frame_type != KEY_FRAME) || - (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - xd->mi[0]->segment_id)) { + cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resoultions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; @@ -3761,10 +4451,35 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, break; default: assert(0); break; } + + // Update ref_frame usage for inter frame if this group is ARF group. + if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame && + !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group && + cpi->sf.use_altref_onepass) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + if (cpi->count_arf_frame_usage != NULL) + cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage; + if (cpi->count_lastgolden_frame_usage != NULL) + cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage; + } + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } // end RTC play code +static INLINE uint32_t variance(const diff *const d) { + return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint32_t variance_highbd(diff *const d) { + const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); + return (var >= 0) ? 
(uint32_t)var : 0; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static int set_var_thresh_from_histogram(VP9_COMP *cpi) { const SPEED_FEATURES *const sf = &cpi->sf; const VP9_COMMON *const cm = &cpi->common; @@ -3794,14 +4509,17 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { case VPX_BITS_8: vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); break; case VPX_BITS_10: vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; case VPX_BITS_12: vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; default: assert(0 && @@ -3812,12 +4530,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); } #else vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); #endif // CONFIG_VP9_HIGHBITDEPTH - var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8); if (var16->var >= VAR_HIST_MAX_BG_VAR) hist[VAR_HIST_BINS - 1]++; @@ -3895,7 +4614,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { const int tile_rows = 1 << cm->log2_tile_rows; int tile_col, tile_row; TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; int tile_tok = 0; + int tplist_count = 0; if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); @@ -3910,51 +4631,75 @@ void vp9_init_tile_data(VP9_COMP *cpi) { int i, j; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = 32; + tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; tile_data->mode_map[i][j] = j; } } +#if CONFIG_MULTITHREAD + tile_data->row_base_thresh_freq_fact = NULL; +#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile_info = &this_tile->tile_info; vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = cpi->tile_tok[tile_row][tile_col]; tile_tok = allocated_tokens(*tile_info); + + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); } } } +void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + int tile_sb_row; + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); + cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; + + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + + 
cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok; + cpi->tplist[tile_row][tile_col][tile_sb_row].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop - + cpi->tplist[tile_row][tile_col][tile_sb_row].start); + assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <= + get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols)); + + (void)tile_mb_cols; +} + void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, int tile_col) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; const int mi_row_start = tile_info->mi_row_start; const int mi_row_end = tile_info->mi_row_end; int mi_row; - // Set up pointers to per thread motion search counters. - td->mb.m_search_count_ptr = &td->rd_counts.m_search_count; - td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count; - - for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) { - if (cpi->sf.use_nonrd_pick_mode) - encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); - else - encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); - } - cpi->tok_count[tile_row][tile_col] = - (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(tok - cpi->tile_tok[tile_row][tile_col] <= - allocated_tokens(*tile_info)); + for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) + vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } static void encode_tiles(VP9_COMP *cpi) { @@ -4002,15 +4747,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; - x->highbd_itxm_add = + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->highbd_inv_txfm_add = xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; if (xd->lossless) x->optimize = 0; @@ -4048,6 +4793,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(x->zcoeff_blk); if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0 && + !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) && !cpi->use_svc) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); @@ -4066,11 +4812,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - // If allowed, encoding tiles in parallel with one thread handling one tile. - if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) - vp9_encode_tiles_mt(cpi); - else - encode_tiles(cpi); + if (!cpi->row_mt) { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + // If allowed, encoding tiles in parallel with one thread handling one + // tile when row based multi-threading is disabled. 
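    // The dummy sync hooks assigned above are no-ops, so the
    // per-superblock-row synchronization calls made by the row encoders
    // cost nothing on this path.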
+ if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + vp9_encode_tiles_row_mt(cpi); + } vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); @@ -4243,8 +4998,31 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } } else { + FRAME_COUNTS *counts = cpi->td.counts; cm->reference_mode = SINGLE_REFERENCE; + if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode && + cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + cm->frame_type != KEY_FRAME) + cm->reference_mode = REFERENCE_MODE_SELECT; + encode_frame_internal(cpi); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + int i; + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } } // If segmented AQ is enabled compute the average AQ weighting. diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.h b/libs/libvpx/vp9/encoder/vp9_encodeframe.h index aa54947858..cf5ae3d8ac 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.h @@ -39,7 +39,11 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); +void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + +void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, + int content_state); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.c b/libs/libvpx/vp9/encoder/vp9_encodemb.c index 20ebe68197..f3c17f2559 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libs/libvpx/vp9/encoder/vp9_encodemb.c @@ -49,36 +49,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} vp9_token_state; - static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { { 10, 6 }, { 8, 5 }, }; -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - -// This function is a place holder for now but may ultimately need -// to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, - int idx, int token, uint8_t *token_cache) { - int bak = token_cache[scan[idx]], pt; - token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(nb, token_cache, idx + 1); - token_cache[scan[idx]] = bak; - return pt; -} +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + ((num) >= 0) ? 
(num) >> (shift) : -((-(num)) >> (shift)) int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { @@ -86,232 +63,244 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(xd->mi[0]); - vp9_token_state tokens[1025][2]; uint8_t token_cache[1024]; - const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; - const PLANE_TYPE type = get_plane_type(plane); + const PLANE_TYPE plane_type = get_plane_type(plane); const int default_eob = 16 << (tx_size << 1); const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, type, block); + const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; - int next = eob, sz = 0; - const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1; + const int64_t rdmult = + ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; + int64_t rate0, rate1; int16_t t0, t1; - EXTRABIT e0; - int best, band, pt, i, final_eob; + int i, final_eob; #if CONFIG_VP9_HIGHBITDEPTH - const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else - const int *cat6_high_cost = vp9_get_high_cost_table(8); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; - assert((!type && !plane) || (type && plane)); + assert((!plane_type && !plane) || (plane_type && plane)); assert(eob <= default_eob); - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. */ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - - for (i = 0; i < eob; i++) - token_cache[scan[i]] = vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])]; - - for (i = eob; i-- > 0;) { - int base_bits, d2, dx; + for (i = 0; i < eob; i++) { const int rc = scan[i]; - int x = qcoeff[rc]; - /* Only add a trellis state for non-zero coefficients. */ - if (x) { - int shortcut = 0; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. 
*/ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - vp9_get_token_extra(x, &t0, &e0); - /* Consider both possible successor states. */ - if (next < default_eob) { - band = band_translate[i + 1]; - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][0].token]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][1].token]; - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - tokens[i][0].rate = base_bits + (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].token = t0; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; + token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; + } + final_eob = 0; - /* Evaluate the second possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) - shortcut = 1; - else - shortcut = 0; - - if (shortcut) { - sz = -(x < 0); - x -= 2 * sz + 1; - } else { - tokens[i][1] = tokens[i][0]; - next = i; - continue; - } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - e0 = 0; - } else { - vp9_get_token_extra(x, &t0, &e0); - t1 = t0; - } - if (next < default_eob) { - band = band_translate[i + 1]; - if (t0 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][0].token]; - } - if (t1 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][1].token]; - } - } - - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - - if (shortcut) { -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - } - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { - tran_low_t offset = dq_step[rc != 0]; - // The 32x32 transform coefficient uses half quantization step size. - // Account for the rounding difference in the dequantized coefficeint - // value when the quantization index is dropped from an even number - // to an odd number. 
- if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; - else - tokens[i][1].dqc = dqcoeff[rc] + offset; - } else { - tokens[i][1].dqc = 0; - } - - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + const int token = vp9_get_token(x); + rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - band = band_translate[i + 1]; - pt = get_coef_context(nb, token_cache, i + 1); - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - /* Update the cost of each path if we're past the EOB token. */ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t0]; - tokens[next][0].token = ZERO_TOKEN; + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; + } else { + distortion1 = distortion_for_zero; } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t1]; - tokens[next][1].token = ZERO_TOKEN; + { + // Calculate RDCost for current coeff for the two candidates. 
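        // Candidate 0 keeps the current quantized value x; candidate 1 uses
        // x1, whose magnitude is one step smaller (possibly zero).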
+ const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; + } + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + int use_x1; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + const int token_next = + (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; + unsigned int( + *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } + } + + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } + + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } + + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; + best_eob_cost_cur = use_x1 ? 
eob_cost1 : eob_cost0; + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + if (use_x1) { + before_best_eob_qc = x1; + before_best_eob_dqc = dqc1; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc0; + } + } } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - /* Don't update next, because we didn't add a new node. */ } } - - /* Now pick the best path through the whole trellis. */ - band = band_translate[i + 1]; - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1]; - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - next = tokens[i][best].next; - best = tokens[i][best].best_index; + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; } - final_eob++; - mb->plane[plane].eobs[block] = final_eob; return final_eob; } +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { @@ -344,37 +333,35 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. 
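  // The assert below enforces that invariant: every quantize call in this
  // function is reached with x->skip_block == 0.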
+ assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; default: assert(0); } @@ -385,28 +372,26 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block, - p->zbin, p->round_fp, p->quant_fp, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(0); 
break; } @@ -424,6 +409,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { @@ -446,7 +434,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); @@ -474,7 +462,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; @@ -495,6 +483,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -521,7 +511,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, @@ -554,7 +544,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -629,25 +619,26 @@ static void encode_block(int plane, int block, int row, int col, if (x->skip_encode || p->eobs[block] == 0) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_4X4: // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
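        // dst16 (declared above via CONVERT_TO_SHORTPTR) is the 16-bit view
        // of the same destination buffer that the high bit depth inverse
        // transforms expect.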
- x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], - xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -669,7 +660,7 @@ static void encode_block(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); break; } @@ -691,11 +682,12 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd); + x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } } @@ -784,12 +776,17 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } - vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst, - x->skip_encode ? src_stride : dst_stride, dst, - dst_stride, col, row, plane); + vp9_predict_intra_block( + xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst, + (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, + dst_stride, col, row, plane); + + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { @@ -801,8 +798,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; case TX_16X16: @@ -818,8 +818,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -836,8 +839,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -848,21 +854,24 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + 
x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) { // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. - x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { - vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd); + vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, + xd->bd); } } break; @@ -927,7 +936,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -940,7 +949,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.c b/libs/libvpx/vp9/encoder/vp9_encodemv.c index 874a8e4b98..023d087c2c 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libs/libvpx/vp9/encoder/vp9_encodemv.c @@ -208,7 +208,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, } void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp) { + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); usehp = usehp && use_mv_hp(ref); @@ -223,8 +224,8 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. 
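  // The running maximum is accumulated through the caller-supplied
  // max_mv_magnitude pointer rather than written directly into
  // cpi->max_mv_magnitude (presumably so multi-threaded callers can keep
  // per-thread maxima).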
if (cpi->sf.mv.auto_mv_step_size) { - unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; - cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude); + const unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; + *max_mv_magnitude = VPXMAX(maxv, *max_mv_magnitude); } } diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.h b/libs/libvpx/vp9/encoder/vp9_encodemv.h index ad77b8154f..9fc7ab8dc4 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libs/libvpx/vp9/encoder/vp9_encodemv.h @@ -23,7 +23,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, nmv_context_counts *const counts); void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp); + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *mvctx, int usehp); diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.c b/libs/libvpx/vp9/encoder/vp9_encoder.c index e8f5ec55ab..2ae59dd981 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.c +++ b/libs/libvpx/vp9/encoder/vp9_encoder.c @@ -50,6 +50,7 @@ #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -70,13 +71,15 @@ // mv. Choose a very high value for // now so that HIGH_PRECISION is always // chosen. -// #define OUTPUT_YUV_REC + +#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold +#define FRAME_RATE_FACTOR 8 #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif #ifdef OUTPUT_YUV_SKINMAP -FILE *yuv_skinmap_file = NULL; +static FILE *yuv_skinmap_file = NULL; #endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -99,6 +102,331 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +// compute adaptive threshold for skip recoding +static int compute_context_model_thresh(const VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_size = (cm->width * cm->height) >> 10; + const int bitrate = (int)(oxcf->target_bandwidth >> 10); + const int qindex_factor = cm->base_qindex + (MAXQ >> 1); + + // This equation makes the threshold adaptive to frame size. + // Coding gain obtained by recoding comes from alternate frames of large + // content change. We skip recoding if the difference of previous and current + // frame context probability model is less than a certain threshold. + // The first component is the most critical part to guarantee adaptivity. + // Other parameters are estimated based on normal setting of hd resolution + // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + const int thresh = + ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * + qindex_factor) >> + 9; + + return thresh; +} + +// compute the total cost difference between current +// and previous frame context prob model. 
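// Each accumulated term below is count * (previous_prob - current_prob)
// for one syntax element (modes, partitions, coefficients, interp filters,
// reference signalling, tx sizes, skip flags and motion vectors); the sum
// is negated before being returned.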
+static int compute_context_model_diff(const VP9_COMMON *const cm) { + const FRAME_CONTEXT *const pre_fc = + &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_CONTEXT *const cur_fc = cm->fc; + const FRAME_COUNTS *counts = &cm->counts; + vpx_prob pre_last_prob, cur_last_prob; + int diff = 0; + int i, j, k, l, m, n; + + // y_mode_prob + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->y_mode[i][j] * + (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->y_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // uv_mode_prob + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->uv_mode[i][j] * + (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->uv_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // partition_prob + for (i = 0; i < PARTITION_CONTEXTS; ++i) { + for (j = 0; j < PARTITION_TYPES - 1; ++j) { + diff += (int)counts->partition[i][j] * + (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2]; + cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2]; + + diff += (int)counts->partition[i][PARTITION_TYPES - 1] * + (pre_last_prob - cur_last_prob); + } + + // coef_probs + for (i = 0; i < TX_SIZES; ++i) { + for (j = 0; j < PLANE_TYPES; ++j) { + for (k = 0; k < REF_TYPES; ++k) { + for (l = 0; l < COEF_BANDS; ++l) { + for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) { + for (n = 0; n < UNCONSTRAINED_NODES; ++n) { + diff += (int)counts->coef[i][j][k][l][m][n] * + (pre_fc->coef_probs[i][j][k][l][m][n] - + cur_fc->coef_probs[i][j][k][l][m][n]); + } + + pre_last_prob = + MAX_PROB - + pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + cur_last_prob = + MAX_PROB - + cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + + diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] * + (pre_last_prob - cur_last_prob); + } + } + } + } + } + + // switchable_interp_prob + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) { + diff += (int)counts->switchable_interp[i][j] * + (pre_fc->switchable_interp_prob[i][j] - + cur_fc->switchable_interp_prob[i][j]); + } + pre_last_prob = + MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + cur_last_prob = + MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + + diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] * + (pre_last_prob - cur_last_prob); + } + + // inter_mode_probs + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + for (j = 0; j < INTER_MODES - 1; ++j) { + diff += (int)counts->inter_mode[i][j] * + (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2]; + + diff += (int)counts->inter_mode[i][INTER_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // intra_inter_prob + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + diff += (int)counts->intra_inter[i][0] * + 
(pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i]; + + diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // comp_inter_prob + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + diff += (int)counts->comp_inter[i][0] * + (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i]; + + diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // single_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < 2; ++j) { + diff += (int)counts->single_ref[i][j][0] * + (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]); + + pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j]; + cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j]; + + diff += + (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob); + } + } + + // comp_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + diff += (int)counts->comp_ref[i][0] * + (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i]; + + diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob); + } + + // tx_probs + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // p32x32 + for (j = 0; j < TX_SIZES - 1; ++j) { + diff += (int)counts->tx.p32x32[i][j] * + (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + + diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] * + (pre_last_prob - cur_last_prob); + + // p16x16 + for (j = 0; j < TX_SIZES - 2; ++j) { + diff += (int)counts->tx.p16x16[i][j] * + (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + + diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] * + (pre_last_prob - cur_last_prob); + + // p8x8 + for (j = 0; j < TX_SIZES - 3; ++j) { + diff += (int)counts->tx.p8x8[i][j] * + (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + + diff += + (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob); + } + + // skip_probs + for (i = 0; i < SKIP_CONTEXTS; ++i) { + diff += (int)counts->skip[i][0] * + (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]); + + pre_last_prob = MAX_PROB - pre_fc->skip_probs[i]; + cur_last_prob = MAX_PROB - cur_fc->skip_probs[i]; + + diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob); + } + + // mv + for (i = 0; i < MV_JOINTS - 1; ++i) { + diff += (int)counts->mv.joints[i] * + (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]); + } + pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2]; + cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2]; + + diff += + (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob); + + for (i = 0; i < 2; ++i) { + const nmv_component_counts *nmv_count = &counts->mv.comps[i]; + const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i]; + const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i]; + + // sign + diff += 
(int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign); + + pre_last_prob = MAX_PROB - pre_nmv_prob->sign; + cur_last_prob = MAX_PROB - cur_nmv_prob->sign; + + diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob); + + // classes + for (j = 0; j < MV_CLASSES - 1; ++j) { + diff += (int)nmv_count->classes[j] * + (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2]; + + diff += (int)nmv_count->classes[MV_CLASSES - 1] * + (pre_last_prob - cur_last_prob); + + // class0 + for (j = 0; j < CLASS0_SIZE - 1; ++j) { + diff += (int)nmv_count->class0[j] * + (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2]; + + diff += (int)nmv_count->class0[CLASS0_SIZE - 1] * + (pre_last_prob - cur_last_prob); + + // bits + for (j = 0; j < MV_OFFSET_BITS; ++j) { + diff += (int)nmv_count->bits[j][0] * + (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]); + + pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j]; + cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j]; + + diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob); + } + + // class0_fp + for (j = 0; j < CLASS0_SIZE; ++j) { + for (k = 0; k < MV_FP_SIZE - 1; ++k) { + diff += (int)nmv_count->class0_fp[j][k] * + (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + + diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] * + (pre_last_prob - cur_last_prob); + } + + // fp + for (j = 0; j < MV_FP_SIZE - 1; ++j) { + diff += + (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2]; + + diff += + (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob); + + // class0_hp + diff += (int)nmv_count->class0_hp[0] * + (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp; + + diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob); + + // hp + diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->hp; + + diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob); + } + + return -diff; +} + // Test for whether to calculate metrics for the frame. 
static int is_psnr_calc_enabled(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -108,26 +436,39 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { } /* clang-format off */ -static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { - { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, - { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, - { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, - { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, - { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, - { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, - { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, - { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, - { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, - { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, +const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { + // sample rate size breadth bitrate cpb + { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 }, // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when - // they are finalized (currently TBD). - { LEVEL_5_2, 1176502272, 8912896, 180000, 0, 8, 8, 10, 4 }, - { LEVEL_6, 1176502272, 35651584, 180000, 0, 8, 16, 10, 4 }, - { LEVEL_6_1, 2353004544u, 35651584, 240000, 0, 8, 16, 10, 4 }, - { LEVEL_6_2, 4706009088u, 35651584, 480000, 0, 8, 16, 10, 4 }, + // they are finalized (currently tentative). + { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, }; /* clang-format on */ +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The average bit-rate is too high.", + "The picture size is too large.", + "The picture width/height is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." 
+}; + static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -224,9 +565,12 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { for (i = 0; i < VP9_LEVELS; ++i) { this_level = &vp9_level_defs[i]; - if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) > - (double)this_level->max_luma_sample_rate || + if ((double)level_spec->max_luma_sample_rate > + (double)this_level->max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || + level_spec->max_luma_picture_breadth > + this_level->max_luma_picture_breadth || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || level_spec->compression_ratio < this_level->compression_ratio || @@ -399,7 +743,9 @@ void vp9_initialize_enc(void) { vp9_init_me_luts(); vp9_rc_init_minq_luts(); vp9_entropy_mv_init(); +#if !CONFIG_REALTIME_ONLY vp9_temporal_filter_init(); +#endif init_done = 1; } } @@ -439,6 +785,32 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->skin_map); + cpi->skin_map = NULL; + + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + + vpx_free(cpi->svc.prev_partition_svc); + cpi->svc.prev_partition_svc = NULL; + + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; + + vpx_free(cpi->content_state_sb_fd); + cpi->content_state_sb_fd = NULL; + + vpx_free(cpi->count_arf_frame_usage); + cpi->count_arf_frame_usage = NULL; + vpx_free(cpi->count_lastgolden_frame_usage); + cpi->count_lastgolden_frame_usage = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -468,6 +840,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; + vpx_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + vp9_free_pc_tree(&cpi->td); for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { @@ -553,6 +928,7 @@ static void restore_coding_context(VP9_COMP *cpi) { *cm->fc = cc->fc; } +#if !CONFIG_REALTIME_ONLY static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -676,6 +1052,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { } } } +#endif // !CONFIG_REALTIME_ONLY static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -802,6 +1179,7 @@ static int alloc_context_buffers_ext(VP9_COMP *cpi) { static void alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + int sb_rows; vp9_alloc_context_buffers(cm, cm->width, cm->height); @@ -815,6 +1193,12 @@ static void alloc_compressor_data(VP9_COMP *cpi) { vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } + sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + vpx_free(cpi->tplist[0][0]); + CHECK_MEM_ERROR( + cm, cpi->tplist[0][0], + vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); + vp9_setup_pc_tree(&cpi->common, &cpi->td); } @@ -838,6 +1222,14 @@ static void set_tile_limits(VP9_COMP *cpi) { clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); cm->log2_tile_rows = cpi->oxcf.tile_rows; } + + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + 
log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (cm->log2_tile_cols > level_tile_cols) { + cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } } static void update_frame_size(VP9_COMP *cpi) { @@ -872,6 +1264,22 @@ static void init_buffer_indices(VP9_COMP *cpi) { cpi->alt_fb_idx = 2; } +static void init_level_constraint(LevelConstraint *lc) { + lc->level_index = -1; + lc->max_cpb_size = INT_MAX; + lc->max_frame_size = INT_MAX; + lc->rc_config_updated = 0; + lc->fail_flag = 0; +} + +static void set_level_constraint(LevelConstraint *ls, int8_t level_index) { + vpx_clear_system_state(); + ls->level_index = level_index; + if (level_index >= 0) { + ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000; + } +} + static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; @@ -887,6 +1295,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); cm->width = oxcf->width; cm->height = oxcf->height; @@ -935,14 +1345,12 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ @@ -981,47 +1389,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, 
source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1057,46 +1424,30 @@ MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { @@ -1107,253 +1458,236 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits8) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits8) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits8) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x64, 
vpx_highbd_sad32x64x4d_bits8) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8, vpx_highbd_sad32x32x4d_bits8) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8, vpx_highbd_sad64x64x4d_bits8) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8, vpx_highbd_sad16x16x4d_bits8) - HIGHBD_BFP( - BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, - vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8) + HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, + vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8) - HIGHBD_BFP( - BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, - vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, + vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8) HIGHBD_BFP( BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, - vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, - vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4, - vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8) + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, - vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8, - vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8) + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) HIGHBD_BFP( BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, - vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) break; case VPX_BITS_10: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, 
vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits10) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits10) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits10) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits10) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10, vpx_highbd_sad32x32x4d_bits10) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10, vpx_highbd_sad64x64x4d_bits10) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10, vpx_highbd_sad16x16x4d_bits10) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10, vpx_highbd_sad16x8x4d_bits10) HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10, vpx_highbd_sad8x16x4d_bits10) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, - vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, NULL, - 
vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10) - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, - vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10) break; case VPX_BITS_12: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits12) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits12) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits12) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits12) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12, vpx_highbd_sad32x32x4d_bits12) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12, vpx_highbd_sad64x64x4d_bits12) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12, vpx_highbd_sad16x16x4d_bits12) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12, vpx_highbd_sad16x8x4d_bits12) HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12, vpx_highbd_sad8x16x4d_bits12) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, - vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, + 
vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12) - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, - vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12) break; default: @@ -1390,6 +1724,33 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } +static void alloc_copy_partition_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->prev_partition == NULL) { + CHECK_MEM_ERROR(cm, cpi->prev_partition, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->prev_partition))); + } + if (cpi->prev_segment_id == NULL) { + CHECK_MEM_ERROR( + cm, cpi->prev_segment_id, + (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->prev_segment_id))); + } + if (cpi->prev_variance_low == NULL) { + CHECK_MEM_ERROR(cm, cpi->prev_variance_low, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, + sizeof(*cpi->prev_variance_low))); + } + if (cpi->copied_frame_cnt == NULL) { + CHECK_MEM_ERROR( + cm, cpi->copied_frame_cnt, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->copied_frame_cnt))); + } +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -1403,6 +1764,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); if (cm->profile <= PROFILE_1) assert(cm->bit_depth == VPX_BITS_8); @@ -1481,13 +1844,17 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { } } - update_frame_size(cpi); + if (cm->current_video_frame == 0 || last_w != cpi->oxcf.width || + last_h != cpi->oxcf.height) + update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; } if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || @@ -1498,6 +1865,24 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { 
(int)cpi->oxcf.target_bandwidth); } + // Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the + // configuration change has a large change in avg_frame_bandwidth. + // For SVC check for resetting based on spatial layer average bandwidth. + // Also reset buffer level to optimal level. + if (cm->current_video_frame > 0) { + if (cpi->use_svc) { + vp9_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = rc->optimal_buffer_level; + } + } + } + cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; @@ -1515,6 +1900,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); #endif + + vp9_set_row_mt(cpi); } #ifndef M_LOG2_E @@ -1612,7 +1999,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); cpi->use_svc = 0; - cpi->resize_state = 0; + cpi->resize_state = ORIG; cpi->external_resize = 0; cpi->resize_avg_qp = 0; cpi->resize_buffer_underflow = 0; @@ -1630,6 +2017,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); + CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); CHECK_MEM_ERROR( @@ -1677,6 +2067,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); + init_level_constraint(&cpi->level_constraint); #if CONFIG_INTERNAL_STATS cpi->b_calculate_blockiness = 1; @@ -1745,7 +2136,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif #endif #ifdef OUTPUT_YUV_SKINMAP - yuv_skinmap_file = fopen("skinmap.yuv", "ab"); + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); #endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); @@ -1758,6 +2149,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { vp9_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -1822,6 +2214,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_init_second_pass(cpi); } } +#endif // !CONFIG_REALTIME_ONLY vp9_set_speed_features_framesize_independent(cpi); vp9_set_speed_features_framesize_dependent(cpi); @@ -1831,67 +2224,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, 
vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) + vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) + vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) + vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3, - vpx_sad16x8x8, vpx_sad16x8x4d) + vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, + vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3, - vpx_sad8x16x8, vpx_sad8x16x4d) + vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, + vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3, - vpx_sad8x8x8, vpx_sad8x8x4d) + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL, - vpx_sad8x4x8, vpx_sad8x4x4d) + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL, - vpx_sad4x8x8, vpx_sad4x8x4d) + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3, - vpx_sad4x4x8, vpx_sad4x4x4d) + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -1958,16 +2345,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsnrY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim, totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count, cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + cpi->worst_ssim, 
cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count, + cpi->psnr.stat[V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -2027,8 +2418,12 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); + vp9_row_mt_mem_dealloc(cpi); - if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync); + if (cpi->num_workers > 1) { + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + } vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); @@ -2136,7 +2531,7 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); return 0; } else { return -1; @@ -2147,7 +2542,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(sd, cfg); + vpx_yv12_copy_frame(sd, cfg); return 0; } else { return -1; @@ -2160,38 +2555,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP) -// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it -// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do -// not denoise the UV channels at this time. If ever we implement UV channel -// denoising we will have to modify this. -void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - #ifdef OUTPUT_YUV_REC void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { YV12_BUFFER_CONFIG *s = cm->frame_to_show; @@ -2297,7 +2660,9 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { + YV12_BUFFER_CONFIG *dst, int bd, + INTERP_FILTER filter_type, + int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -2307,7 +2672,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -2315,24 +2680,24 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_stride = src_strides[i]; const int dst_stride = dst_strides[i]; for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * 
src_w / dst_w; + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * src_stride + (x / factor) * src_w / dst_w; uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); } else { - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } @@ -2340,44 +2705,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, vpx_extend_frame_borders(dst); } -#else -void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { - const int src_w = src->y_crop_width; - const int src_h = src->y_crop_height; - const int dst_w = dst->y_crop_width; - const int dst_h = dst->y_crop_height; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - int x, y, i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = (i == 0 || i == 3 ? 
1 : 2); - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; - for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w; - const uint8_t *src_ptr = srcs[i] + - (y / factor) * src_h / dst_h * src_stride + - (x / factor) * src_w / dst_w; - uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); - - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); - } - } - } - - vpx_extend_frame_borders(dst); -} #endif // CONFIG_VP9_HIGHBITDEPTH static int scale_down(VP9_COMP *cpi, int q) { @@ -2396,11 +2723,33 @@ static int scale_down(VP9_COMP *cpi, int q) { return scale; } -static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) { +static int big_rate_miss_high_threshold(VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; - return (rc->projected_frame_size > ((high_limit * 3) / 2)) || - (rc->projected_frame_size < (low_limit / 2)); + if (frame_is_kf_gf_arf(cpi)) + big_miss_high = rc->this_frame_target * 3 / 2; + else + big_miss_high = rc->this_frame_target * 2; + + return big_miss_high; +} + +static int big_rate_miss(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + int big_miss_low; + + // Ignore for overlay frames + if (rc->is_src_frame_alt_ref) { + return 0; + } else { + big_miss_low = (rc->this_frame_target / 2); + big_miss_high = big_rate_miss_high_threshold(cpi); + + return (rc->projected_frame_size > big_miss_high) || + (rc->projected_frame_size < big_miss_low); + } } // test in two pass for the first @@ -2425,8 +2774,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, int force_recode = 0; if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - big_rate_miss(cpi, high_limit, low_limit) || - (cpi->sf.recode_loop == ALLOW_RECODE) || + big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) || (two_pass_first_group_inter(cpi) && (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) || (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) { @@ -2437,6 +2785,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, return 1; } + // Force recode for extreme overshoot. + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF && + rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) { + return 1; + } + // TODO(agrange) high_limit could be greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || (rc->projected_frame_size < low_limit && q > minq)) { @@ -2523,12 +2878,41 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { sizeof(cpi->interp_filter_selected[0])); } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->denoiser.denoising_level > kDenLowLow) { + int svc_base_is_key = 0; + int denoise_svc_second_layer = 0; + if (cpi->use_svc) { + int realloc_fail = 0; + const int svc_buf_shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? 
cpi->denoiser.num_ref_frames + : 0; + int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, + cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + svc_base_is_key = lc->is_key_frame; + denoise_svc_second_layer = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 + : 0; + // Check if we need to allocate extra buffers in the denoiser + // for + // refreshed frames. + realloc_fail = vp9_denoiser_realloc_svc( + cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx); + if (realloc_fail) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); + } vp9_denoiser_update_frame_info( &cpi->denoiser, *cpi->Source, cpi->common.frame_type, cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->resize_pending); + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, + denoise_svc_second_layer); } #endif if (is_one_pass_cbr_svc(cpi)) { @@ -2553,6 +2937,10 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; + const int is_reference_frame = + (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (xd->lossless) { lf->filter_level = 0; lf->last_filt_level = 0; @@ -2578,7 +2966,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); } - if (lf->filter_level > 0) { + if (lf->filter_level > 0 && is_reference_frame) { vp9_build_mask_frame(cm, lf->filter_level, 0); if (cpi->num_workers > 1) @@ -2643,7 +3031,8 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); + scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2666,7 +3055,7 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf); + vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2794,17 +3183,37 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", + if (!cm->current_video_frame) { + fprintf(f, "frame, width, height, last ts, last end ts, " + "source_alt_ref_pending, source_alt_ref_active, " + "this_frame_target, projected_frame_size, " + "projected_frame_size / MBs, " + "projected_frame_size - this_frame_target, " + "vbr_bits_off_target, vbr_bits_off_target_fast, " + "twopass.extend_minq, twopass.extend_minq_fast, " + "total_target_vs_actual, " + "starting_buffer_level - bits_off_target, " + "total_actual_bits, base_qindex, q for 
base_qindex, " + "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, " + "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, " + "frame_type, gfu_boost, " + "twopass.bits_left, " + "twopass.total_left_stats.coded_error, " + "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), " + "tot_recode_hits, recon_err, kf_boost, " + "twopass.kf_zeromotion_pct, twopass.fr_content_type, " + "filter_level, seg.aq_av_offset\n"); + } + + fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, " + "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", " + "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, " + "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, " + "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n", cpi->common.current_video_frame, cm->width, cm->height, - cpi->td.rd_counts.m_search_count, - cpi->td.rd_counts.ex_search_count, + cpi->last_time_stamp_seen, + cpi->last_end_time_stamp_seen, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, @@ -2892,7 +3301,6 @@ static void set_size_independent_vars(VP9_COMP *cpi) { static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, int *top_index) { VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; // Setup variables that depend on the dimensions of the frame. vp9_set_speed_features_framesize_dependent(cpi); @@ -2904,17 +3312,19 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } +#if !CONFIG_REALTIME_ONLY // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in the second pass of a two pass encode, as it requires // lagged coding, and if the relevant speed feature flag is set. 
- if (oxcf->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING) - if (oxcf->noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; - switch (oxcf->noise_sensitivity) { + switch (cpi->oxcf.noise_sensitivity) { case 1: l = 20; break; case 2: l = 40; break; case 3: l = 60; break; @@ -2923,8 +3333,9 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - cpi->common.postproc_state.limits = vpx_calloc( - cpi->common.width, sizeof(*cpi->common.postproc_state.limits)); + cpi->common.postproc_state.limits = + vpx_calloc(cpi->un_scaled_source->y_width, + sizeof(*cpi->common.postproc_state.limits)); } vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits); } @@ -2936,7 +3347,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { - if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, + if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, @@ -2964,6 +3376,7 @@ static void set_frame_size(VP9_COMP *cpi) { VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR && ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { @@ -2974,6 +3387,7 @@ static void set_frame_size(VP9_COMP *cpi) { vp9_set_size_literal(cpi, oxcf->scaled_frame_width, oxcf->scaled_frame_height); } +#endif // !CONFIG_REALTIME_ONLY if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { @@ -3054,6 +3468,19 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + const INTERP_FILTER filter_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + : EIGHTTAP; + const int phase_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] + : 0; + + // Flag to check if its valid to compute the source sad (used for + // scene detection and for superblock content state in CBR mode). + // The flag may get reset below based on SVC or resizing state. + cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME; vpx_clear_system_state(); @@ -3067,8 +3494,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. 
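  // Illustrative sketch of the two-stage path described in the comment
  // above, assuming example dimensions only (a 1280x720 top-layer source and
  // a 320x180 spatial layer; the real sizes come from the SVC configuration,
  // and the 8-bit path is shown):
  //
  //   /* stage 1: 1280x720 -> 640x360, kept in cpi->svc.scaled_temp */
  //   vp9_scale_and_extend_frame(cpi->un_scaled_source, &cpi->svc.scaled_temp,
  //                              filter_scaler2, phase_scaler2);
  //   /* stage 2: 640x360 -> 320x180, the layer actually being encoded */
  //   vp9_scale_and_extend_frame(&cpi->svc.scaled_temp, &cpi->scaled_source,
  //                              filter_scaler, phase_scaler);
  //
  // Each stage is then a plain 2:1 downscale, the case the optimized scaler
  // handles, and the intermediate 1/2x1/2 frame stays available for possible
  // reuse by the half-resolution layer (svc.scaled_one_half below).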
+ const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; + const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp); + cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, + filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); cpi->svc.scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && @@ -3080,16 +3510,17 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->svc.scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), + filter_scaler, phase_scaler); } // Unfiltered raw source used in metrics calculation if the source // has been filtered. if (is_psnr_calc_enabled(cpi)) { #ifdef ENABLE_KF_DENOISE if (is_spatial_denoise_enabled(cpi)) { - cpi->raw_source_frame = - vp9_scale_if_required(cm, &cpi->raw_unscaled_source, - &cpi->raw_scaled_source, (cpi->oxcf.pass == 0)); + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler); } else { cpi->raw_source_frame = cpi->Source; } @@ -3098,8 +3529,21 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, #endif } + if ((cpi->use_svc && + (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || + cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || + cpi->svc.current_superframe < 1)) || + cpi->resize_pending || cpi->resize_state || cpi->external_resize || + cpi->resize_state != ORIG) { + cpi->compute_source_sad_onepass = 0; + if (cpi->content_state_sb_fd != NULL) + memset(cpi->content_state_sb_fd, 0, + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * + sizeof(*cpi->content_state_sb_fd)); + } + // Avoid scaling last_source unless its needed. - // Last source is needed if vp9_avg_source_sad() is used, or if + // Last source is needed if avg_source_sad() is used, or if // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise // estimation is enabled. 
if (cpi->unscaled_last_source != NULL && @@ -3107,10 +3551,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) || cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || - cpi->noise_estimate.enabled)) - cpi->Last_Source = - vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, (cpi->oxcf.pass == 0)); + (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || + cpi->compute_source_sad_onepass)) + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); + + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + cpi->compute_source_sad_onepass = 0; if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, @@ -3119,22 +3569,39 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_update_noise_estimate(cpi); - if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && - cpi->oxcf.speed >= 5 && cpi->resize_state == 0 && - (cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR)) - vp9_avg_source_sad(cpi); + // Scene detection is always used for VBR mode or screen-content case. + // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now + // (need to check encoding time cost for doing this for speed 8). + cpi->rc.high_source_sad = 0; + if (cpi->compute_source_sad_onepass && cm->show_frame && + (cpi->oxcf.rc_mode == VPX_VBR || + cpi->oxcf.content == VP9E_CONTENT_SCREEN || + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + vp9_scene_detection_onepass(cpi); - // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference - // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this - // frame-level upsampling. - if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) { + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame + // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can + // avoid this frame-level upsampling (for non intra_only frames). + if (frame_is_intra_only(cm) == 0 && + !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) { vp9_scale_references(cpi); } set_size_independent_vars(cpi); set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); + + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { + if (cpi->svc.prev_partition_svc == NULL) { + CHECK_MEM_ERROR( + cm, cpi->svc.prev_partition_svc, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->svc.prev_partition_svc))); + } + } + if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && @@ -3143,7 +3610,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } vp9_set_quantizer(cm, q); - vp9_set_variance_partition_thresholds(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); setup_frame(cpi); @@ -3172,7 +3639,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Check if we should drop this frame because of high overshoot. // Only for frames where high temporal-source SAD is detected. 
if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && - cpi->resize_state == 0 && cm->frame_type != KEY_FRAME && + cpi->resize_state == ORIG && cm->frame_type != KEY_FRAME && cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->rc.high_source_sad == 1) { int frame_size = 0; @@ -3186,7 +3653,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) { vpx_clear_system_state(); vp9_set_quantizer(cm, q); - vp9_set_variance_partition_thresholds(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); suppress_active_map(cpi); // Turn-off cyclic refresh for re-encoded frame. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { @@ -3199,12 +3666,10 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } } - // Update some stats from cyclic refresh, and check if we should not update - // golden reference, for non-SVC 1 pass CBR. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME && - !cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && - (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR)) - vp9_cyclic_refresh_check_golden_update(cpi); + // Update some stats from cyclic refresh, and check for golden frame update. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cm->frame_type != KEY_FRAME) + vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. @@ -3212,8 +3677,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vpx_clear_system_state(); } +#define MAX_QSTEP_ADJ 4 +static int get_qstep_adj(int rate_excess, int rate_limit) { + int qstep = + rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX; + return VPXMIN(qstep, MAX_QSTEP_ADJ); +} + static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; @@ -3225,9 +3698,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; + int enable_acl; +#ifdef AGGRESSIVE_VBR + int qrange_adj = 1; +#endif set_size_independent_vars(cpi); + enable_acl = cpi->sf.allow_acl + ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) + : 0; + do { vpx_clear_system_state(); @@ -3236,6 +3717,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (loop_count == 0 || cpi->resize_pending != 0) { set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + // Adjustment limits for min and max q + qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2); + + bottom_index = + VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q); + top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2); + } +#endif // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); @@ -3259,8 +3750,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, &frame_over_shoot_limit); } - cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cpi->Source = + vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. 
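// Illustrative arithmetic for the get_qstep_adj() helper added above, using
// round example numbers: the rate miss is divided by the rate limit with
// rounding to nearest, then clamped to MAX_QSTEP_ADJ (4).
//
//   get_qstep_adj(/*rate_excess=*/30000, /*rate_limit=*/10000)
//       -> (30000 + 5000) / 10000 = 3
//   get_qstep_adj(50000, 10000) -> 5, clamped to 4
//   get_qstep_adj(30000, 0)     -> INT_MAX, clamped to 4
//
// In the recode loop this replaces the old fixed +/-1 step: on overshoot
// q_low moves up by qstep (q_low = VPXMIN(q + qstep, q_high)), and on
// undershoot q_high moves down by qstep, so large rate misses converge in
// fewer recode iterations.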
@@ -3269,7 +3761,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0)); + (oxcf->pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3281,7 +3773,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->unscaled_last_source != NULL) cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source, - (cpi->oxcf.pass == 0)); + (oxcf->pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3296,13 +3788,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + if (oxcf->aq_mode == VARIANCE_AQ) { vp9_vaq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) { + } else if (oxcf->aq_mode == EQUATOR360_AQ) { vp9_360aq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + } else if (oxcf->aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { + } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); } @@ -3322,12 +3814,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); rc->projected_frame_size = (int)(*size) << 3; - restore_coding_context(cpi); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->oxcf.rc_mode == VPX_Q) { + if (oxcf->rc_mode == VPX_Q) { loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && @@ -3385,6 +3876,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // to attempt to recode. int last_q = q; int retries = 0; + int qstep; if (cpi->resize_pending == 1) { // Change in frame size so go back around the recode loop. @@ -3406,11 +3898,24 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Frame is too large if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (rc->projected_frame_size >= rc->max_frame_bandwidth) - q_high = rc->worst_quality; + if ((q == q_high) && + ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi)))) { + int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, + big_rate_miss_high_threshold(cpi))); + double q_val_high; + q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth); + q_val_high = + q_val_high * ((double)rc->projected_frame_size / max_rate); + q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth); + q_high = clamp(q_high, rc->best_quality, rc->worst_quality); + } // Raise Qlow as to at least the current value - q_low = q < q_high ? q + 1 : q_high; + qstep = + get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); + q_low = VPXMIN(q + qstep, q_high); if (undershoot_seen || loop_at_this_size > 1) { // Update rate_correction_factor unless @@ -3435,31 +3940,32 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, overshoot_seen = 1; } else { // Frame is too small - q_high = q > q_low ? 
q - 1 : q_low; + qstep = + get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); + q_high = VPXMAX(q - qstep, q_low); if (overshoot_seen || loop_at_this_size > 1) { vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. - if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) { + if (oxcf->rc_mode == VPX_CQ && q < q_low) { q_low = q; } while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); retries++; } } - undershoot_seen = 1; } @@ -3485,7 +3991,50 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, ++cpi->tot_recode_hits; #endif } + + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) + if (loop || !enable_acl) restore_coding_context(cpi); } while (loop); + +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + cpi->twopass.active_worst_quality = + VPXMIN(q + qrange_adj, oxcf->worst_allowed_q); + } else if (!frame_is_kf_gf_arf(cpi)) { +#else + if (!frame_is_kf_gf_arf(cpi)) { +#endif + // Have we been forced to adapt Q outside the expected range by an extreme + // rate miss. If so adjust the active maxQ for the subsequent frames. + if (q > cpi->twopass.active_worst_quality) { + cpi->twopass.active_worst_quality = q; + } else if (oxcf->vbr_corpus_complexity && q == q_low && + rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); + } + } + + if (enable_acl) { + // Skip recoding, if model diff is below threshold + const int thresh = compute_context_model_thresh(cpi); + const int diff = compute_context_model_diff(cm); + if (diff < thresh) { + vpx_clear_system_state(); + restore_coding_context(cpi); + return; + } + + vp9_encode_frame(cpi); + vpx_clear_system_state(); + restore_coding_context(cpi); + vp9_pack_bitstream(cpi, dest, size); + + vp9_encode_frame(cpi); + vpx_clear_system_state(); + + restore_coding_context(cpi); + } } static int get_ref_frame_flags(const VP9_COMP *cpi) { @@ -3525,18 +4074,28 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp) { +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH - scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth); - scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth); + if (cm->bit_depth == VPX_BITS_8) { + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); + } else { + 
scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + } #else - vp9_scale_and_extend_frame(unscaled, scaled_temp); - vp9_scale_and_extend_frame(scaled_temp, scaled); + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -3544,22 +4103,25 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, } } -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler) { +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) - scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); + if (cm->bit_depth == VPX_BITS_8) + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); + else + scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) - vp9_scale_and_extend_frame(unscaled, scaled); + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -3892,12 +4454,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, ++cm->current_video_frame; cpi->ext_refresh_frame_flags_pending = 0; cpi->svc.rc_drop_superframe = 1; + cpi->last_frame_dropped = 1; // TODO(marpan): Advancing the svc counters on dropped frames can break // the referencing scheme for the fixed svc patterns defined in // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but // for now, don't advance the svc frame counters on dropped frame. 
// if (cpi->use_svc) // vp9_inc_frame_in_layer(cpi); + return; } } @@ -3915,21 +4479,23 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, encode_with_recode_loop(cpi, size, dest); } + cpi->last_frame_dropped = 0; + // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_try_disable_lookahead_aq(cpi, size, dest); #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED - if (oxcf->noise_sensitivity > 0) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); + if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); } #endif #endif #ifdef OUTPUT_YUV_SKINMAP if (cpi->common.current_video_frame > 1) { - vp9_compute_skin_map(cpi, yuv_skinmap_file); + vp9_output_skin_map(cpi, yuv_skinmap_file); } #endif @@ -4066,6 +4632,7 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, encode_frame_to_data_rate(cpi, size, dest, frame_flags); } +#if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; @@ -4074,6 +4641,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) vp9_twopass_postencode_update(cpi); } +#endif // !CONFIG_REALTIME_ONLY static void init_ref_frame_bufs(VP9_COMMON *cm) { int i; @@ -4268,6 +4836,26 @@ static void adjust_image_stat(double y, double u, double v, double all, } #endif // CONFIG_INTERNAL_STATS +// Adjust the maximum allowable frame size for the target level. +static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) { + RATE_CONTROL *const rc = &cpi->rc; + LevelConstraint *const ls = &cpi->level_constraint; + VP9_COMMON *const cm = &cpi->common; + const double max_cpb_size = ls->max_cpb_size; + vpx_clear_system_state(); + rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size); + if (frame_is_intra_only(cm)) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5)); + } else if (arf_src_index > 0) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4)); + } else { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2)); + } +} + static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { VP9_COMMON *const cm = &cpi->common; Vp9LevelInfo *const level_info = &cpi->level_info; @@ -4276,6 +4864,9 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height); + LevelConstraint *const level_constraint = &cpi->level_constraint; + const int8_t level_index = level_constraint->level_index; double cpb_data_size; vpx_clear_system_state(); @@ -4377,6 +4968,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { level_spec->max_luma_picture_size = luma_pic_size; } + // update max_luma_picture_breadth + if (luma_pic_breadth > level_spec->max_luma_picture_breadth) { + level_spec->max_luma_picture_breadth = luma_pic_breadth; + } + // update compression_ratio level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * cm->bit_depth / @@ -4386,6 +4982,87 @@ static void update_level_info(VP9_COMP 
*cpi, size_t *size, int arf_src_index) { if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) { level_spec->max_col_tiles = (1 << cm->log2_tile_cols); } + + if (level_index >= 0 && level_constraint->fail_flag == 0) { + if (level_spec->max_luma_picture_size > + vp9_level_defs[level_index].max_luma_picture_size) { + level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); + } + + if (level_spec->max_luma_picture_breadth > + vp9_level_defs[level_index].max_luma_picture_breadth) { + level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]); + } + + if ((double)level_spec->max_luma_sample_rate > + (double)vp9_level_defs[level_index].max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P)) { + level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]); + } + + if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) { + level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_COLUMN_TILE]); + } + + if (level_spec->min_altref_distance < + vp9_level_defs[level_index].min_altref_distance) { + level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[ALTREF_DIST_TOO_SMALL]); + } + + if (level_spec->max_ref_frame_buffers > + vp9_level_defs[level_index].max_ref_frame_buffers) { + level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_REF_BUFFER]); + } + + if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) { + level_constraint->fail_flag |= (1 << CPB_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[CPB_TOO_LARGE]); + } + + // Set an upper bound for the next frame size. It will be used in + // level_rc_framerate() before encoding the next frame. 
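/* Worked example with hypothetical numbers (not part of the patch): if the
 * target level allows max_cpb_size = 4000 kilobits and the most recent
 * CPB_WINDOW_SIZE - 1 coded frames total 250000 bytes, then
 *   cpb_data_size  = 250000 / 125.0        = 2000 kilobits,
 *   max_frame_size = (4000 - 2000) * 1000  = 2,000,000 bits,
 * which level_rc_framerate() then applies as a cap on
 * rc->max_frame_bandwidth. While the window is still filling, the cap is
 * additionally halved. */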
+ cpb_data_size = 0; + for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) { + if (i >= level_stats->frame_window_buffer.len) break; + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + cpb_data_size += level_stats->frame_window_buffer.buf[idx].size; + } + cpb_data_size = cpb_data_size / 125.0; + level_constraint->max_frame_size = + (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) * + 1000.0); + if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1) + level_constraint->max_frame_size >>= 1; + } } int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, @@ -4474,8 +5151,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; #endif - - if ((oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { +#if !CONFIG_REALTIME_ONLY + if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && + (oxcf->arnr_strength > 0)) { int bitrate = cpi->rc.avg_frame_bandwidth / 40; int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY; @@ -4493,7 +5171,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, force_src_buffer = &cpi->alt_ref_buffer; } - +#endif cm->show_frame = 0; cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; @@ -4524,8 +5202,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - // TODO(Won Kap): this needs to change if per-layer intra frame is - // allowed. if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); @@ -4554,10 +5230,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else { *size = 0; +#if !CONFIG_REALTIME_ONLY if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } +#endif // !CONFIG_REALTIME_ONLY return -1; } @@ -4604,6 +5282,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; +#if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && (!cpi->use_svc || (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state != ENCODING))) { @@ -4611,25 +5290,39 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else if (oxcf->pass == 1) { set_frame_size(cpi); } +#endif // !CONFIG_REALTIME_ONLY + + if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && + cpi->level_constraint.fail_flag == 0) + level_rc_framerate(cpi, arf_src_index); if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } + cpi->td.mb.fp_src_pred = 0; +#if CONFIG_REALTIME_ONLY + if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } +#else // !CONFIG_REALTIME_ONLY if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) - cpi->td.mb.fwd_txm4x4 = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - cpi->td.mb.fwd_txm4x4 = lossless ? 
vp9_fwht4x4 : vpx_fdct4x4; - cpi->td.mb.highbd_itxm_add = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.highbd_inv_txfm_add = lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); @@ -4639,6 +5332,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // One pass encode Pass0Encode(cpi, size, dest, frame_flags); } +#endif // CONFIG_REALTIME_ONLY if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -4985,3 +5679,28 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { vp9_update_entropy(cpi, 0); } } + +void vp9_set_row_mt(VP9_COMP *cpi) { + // Enable row based multi-threading for supported modes of encoding + cpi->row_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.row_mt && !cpi->use_svc) + cpi->row_mt = 1; + + if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.row_mt && + !cpi->use_svc) + cpi->row_mt = 1; + + // In realtime mode, enable row based multi-threading for all the speed levels + // where non-rd path is used. + if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) { + cpi->row_mt = 1; + } + + if (cpi->row_mt) + cpi->row_mt_bit_exact = 1; + else + cpi->row_mt_bit_exact = 0; +} diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.h b/libs/libvpx/vp9/encoder/vp9_encoder.h index 66e41492b5..d723d93cbc 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.h +++ b/libs/libvpx/vp9/encoder/vp9_encoder.h @@ -33,7 +33,9 @@ #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_job_queue.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_mcomp.h" @@ -129,6 +131,16 @@ typedef enum { RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec. } RESIZE_TYPE; +typedef enum { + kInvalid = 0, + kLowSadLowSumdiff = 1, + kLowSadHighSumdiff = 2, + kHighSadLowSumdiff = 3, + kHighSadHighSumdiff = 4, + kLowVarHighSumdiff = 5, + kVeryHighSad = 6, +} CONTENT_STATE_SB; + typedef struct VP9EncoderConfig { BITSTREAM_PROFILE profile; vpx_bit_depth_t bit_depth; // Codec bit-depth. 
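The row based multi-threading wired up in vp9_set_row_mt() above is only honoured for the mode/speed/pass combinations listed there, so an application has to request it and may still not get it. A minimal sketch of such a request through the public encoder API follows (standard libvpx calls; the resolution and thread count are placeholders):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: ask for row-mt plus several worker threads for a VP9 encode.
 * vp9_set_row_mt() may still leave cpi->row_mt at 0 if the configured
 * mode/speed/pass combination does not support it. */
static int open_vp9_encoder_with_row_mt(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  cfg.g_w = 1280;
  cfg.g_h = 720;
  cfg.g_threads = 4; /* with row-mt this may exceed the number of tile columns */
  if (vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  return vpx_codec_control(codec, VP9E_SET_ROW_MT, 1) ? -1 : 0;
}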
@@ -197,6 +209,7 @@ typedef struct VP9EncoderConfig { int two_pass_vbrbias; // two pass datarate control tweaks int two_pass_vbrmin_section; int two_pass_vbrmax_section; + int vbr_corpus_complexity; // 0 indicates corpus vbr disabled // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- @@ -237,7 +250,7 @@ typedef struct VP9EncoderConfig { int max_threads; - int target_level; + unsigned int target_level; vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; @@ -256,6 +269,9 @@ typedef struct VP9EncoderConfig { int render_width; int render_height; VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; + + int row_mt; + unsigned int motion_vector_unit_test; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -267,14 +283,47 @@ typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int mode_map[BLOCK_SIZES][MAX_MODES]; + FIRSTPASS_DATA fp_data; + VP9RowMTSync row_mt_sync; + + // Used for adaptive_rd_thresh with row multithreading + int *row_base_thresh_freq_fact; } TileDataEnc; +typedef struct RowMTInfo { + JobQueueHandle job_queue_hdl; +#if CONFIG_MULTITHREAD + pthread_mutex_t job_mutex; +#endif +} RowMTInfo; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_vert_unit_rows; + + // Frame level params + int num_tile_vert_sbs[MAX_NUM_TILE_ROWS]; + + // Job Queue structure and handles + JobQueue *job_queue; + + int jobs_per_tile_col; + + RowMTInfo row_mt_info[MAX_NUM_TILE_COLS]; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + typedef struct RD_COUNTS { vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; int64_t comp_pred_diff[REFERENCE_MODES]; int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - int m_search_count; - int ex_search_count; } RD_COUNTS; typedef struct ThreadData { @@ -312,6 +361,7 @@ typedef struct IMAGE_STAT { typedef enum { LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, LEVEL_1 = 10, LEVEL_1_1 = 11, LEVEL_2 = 20, @@ -333,6 +383,7 @@ typedef struct { VP9_LEVEL level; uint64_t max_luma_sample_rate; uint32_t max_luma_picture_size; + uint32_t max_luma_picture_breadth; double average_bitrate; // in kilobits per second double max_cpb_size; // in kilobits double compression_ratio; @@ -341,6 +392,8 @@ typedef struct { uint8_t max_ref_frame_buffers; } Vp9LevelSpec; +extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]; + typedef struct { int64_t ts; // timestamp uint32_t luma_samples; @@ -368,6 +421,35 @@ typedef struct { Vp9LevelSpec level_spec; } Vp9LevelInfo; +typedef enum { + BITRATE_TOO_LARGE = 0, + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_BREADTH_TOO_LARGE, + LUMA_SAMPLE_RATE_TOO_LARGE, + CPB_TOO_LARGE, + COMPRESSION_RATIO_TOO_SMALL, + TOO_MANY_COLUMN_TILE, + ALTREF_DIST_TOO_SMALL, + TOO_MANY_REF_BUFFER, + TARGET_LEVEL_FAIL_IDS +} TARGET_LEVEL_FAIL_ID; + +typedef struct { + int8_t level_index; + uint8_t rc_config_updated; + uint8_t fail_flag; + int max_frame_size; // in bits + double max_cpb_size; // in bits +} LevelConstraint; + +typedef struct ARNRFilterData { + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + int strength; + int frame_count; + int alt_ref_index; + struct scale_factors sf; +} ARNRFilterData; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -418,6 +500,7 @@ typedef struct VP9_COMP { TOKENEXTRA *tile_tok[4][1 << 6]; 
uint32_t tok_count[4][1 << 6]; + TOKENLIST *tplist[4][1 << 6]; // Ambient reconstruction err target for force key frames int64_t ambient_err; @@ -463,6 +546,8 @@ typedef struct VP9_COMP { uint8_t *segmentation_map; + uint8_t *skin_map; + // segment threashold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; @@ -470,7 +555,6 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - vp9_full_search_fn_t full_search_sad; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -571,7 +655,7 @@ typedef struct VP9_COMP { #endif int resize_pending; - int resize_state; + RESIZE_STATE resize_state; int external_resize; int resize_scale_num; int resize_scale_den; @@ -594,6 +678,8 @@ typedef struct VP9_COMP { int64_t vbp_thresholds[4]; int64_t vbp_threshold_minmax; int64_t vbp_threshold_sad; + // Threshold used for partition copy + int64_t vbp_threshold_copy; BLOCK_SIZE vbp_bsize_min; // Multi-threading @@ -601,9 +687,42 @@ typedef struct VP9_COMP { VPxWorker *workers; struct EncWorkerData *tile_thr_data; VP9LfSync lf_row_sync; + struct VP9BitstreamWorkerData *vp9_bitstream_worker_data; int keep_level_stats; Vp9LevelInfo level_info; + MultiThreadHandle multi_thread_ctxt; + void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); + void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); + ARNRFilterData arnr_filter_data; + + int row_mt; + unsigned int row_mt_bit_exact; + + // Previous Partition Info + BLOCK_SIZE *prev_partition; + int8_t *prev_segment_id; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for + // 32x32, 9~24 for 16x16. + // This is for the last frame and is copied to the current frame + // when partition copy happens. + uint8_t *prev_variance_low; + uint8_t *copied_frame_cnt; + uint8_t max_copied_frame; + // If the last frame is dropped, we don't copy partition. + uint8_t last_frame_dropped; + + // For each superblock: keeps track of the last time (in frame distance) the + // the superblock did not have low source sad. 
+ uint8_t *content_state_sb_fd; + + int compute_source_sad_onepass; + + LevelConstraint level_constraint; + + uint8_t *count_arf_frame_usage; + uint8_t *count_lastgolden_frame_usage; } VP9_COMP; void vp9_initialize_enc(void); @@ -702,6 +821,20 @@ static INLINE int allocated_tokens(TileInfo tile) { return get_token_alloc(tile_mb_rows, tile_mb_cols); } +static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col, + int mi_row, TOKENEXTRA **tok) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + const int mb_row = (mi_row - tile_info->mi_row_start) >> 1; + + *tok = + cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols); +} + int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, @@ -714,15 +847,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi); void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp); +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler); +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); @@ -734,8 +866,18 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } +#if CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); +} +#endif + +#define MIN_LOOKAHEAD_FOR_ARFS 4 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { - return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && + return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && + cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && (cpi->oxcf.enable_auto_arf && (!is_two_pass_svc(cpi) || cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); @@ -758,10 +900,48 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? 
cost_list : NULL; } +static INLINE int get_num_vert_units(TileInfo tile, int shift) { + int num_vert_units = + (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift; + return num_vert_units; +} + +static INLINE int get_num_cols(TileInfo tile, int shift) { + int num_cols = + (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift; + return num_cols; +} + +static INLINE int get_level_index(VP9_LEVEL level) { + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (level == vp9_level_defs[i].level) return i; + } + return -1; +} + +// Return the log2 value of max column tiles corresponding to the level that +// the picture size fits into. +static INLINE int log_tile_cols_from_picsize_level(uint32_t width, + uint32_t height) { + int i; + const uint32_t pic_size = width * height; + const uint32_t pic_breadth = VPXMAX(width, height); + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + return get_msb(vp9_level_defs[i].max_col_tiles); + } + } + return INT_MAX; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); +void vp9_set_row_mt(VP9_COMP *cpi); + #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) #ifdef __cplusplus diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.c b/libs/libvpx/vp9/encoder/vp9_ethread.c index 7657573bbf..0bd2e21451 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.c +++ b/libs/libvpx/vp9/encoder/vp9_ethread.c @@ -11,6 +11,9 @@ #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" #include "vpx_dsp/vpx_dsp_common.h" static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { @@ -30,13 +33,10 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { for (n = 0; n < ENTROPY_TOKENS; n++) td->rd_counts.coef_counts[i][j][k][l][m][n] += td_t->rd_counts.coef_counts[i][j][k][l][m][n]; - - // Counts of all motion searches and exhuastive mesh searches. 
- td->rd_counts.m_search_count += td_t->rd_counts.m_search_count; - td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -65,25 +65,29 @@ static int get_max_tile_cols(VP9_COMP *cpi) { vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); log2_tile_cols = clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (log2_tile_cols > level_tile_cols) { + log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } return (1 << log2_tile_cols); } -void vp9_encode_tiles_mt(VP9_COMP *cpi) { +static void create_enc_workers(VP9_COMP *cpi, int num_workers) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); int i; - vp9_init_tile_data(cpi); - // Only run once to create threads and allocate thread data. if (cpi->num_workers == 0) { int allocated_workers = num_workers; // While using SVC, we need to allocate threads according to the highest - // resolution. - if (cpi->use_svc) { + // resolution. When row based multithreading is enabled, it is OK to + // allocate more threads than the number of max tile columns. + if (cpi->use_svc && !cpi->row_mt) { int max_tile_cols = get_max_tile_cols(cpi); allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); } @@ -127,19 +131,57 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { thread_data->cpi = cpi; thread_data->td = &cpi->td; } - winterface->sync(worker); } } +} + +static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, + int num_workers) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; - - worker->hook = (VPxWorkerHook)enc_worker_hook; + worker->hook = hook; worker->data1 = &cpi->tile_thr_data[i]; - worker->data2 = NULL; - thread_data = (EncWorkerData *)worker->data1; + worker->data2 = data2; + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} + +void vp9_encode_tiles_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); + int i; + + vp9_init_tile_data(cpi); + + create_enc_workers(cpi, num_workers); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -169,25 +211,445 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } - // Encode a frame - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - - // Set the starting tile for each thread. - thread_data->start = i; - - if (i == cpi->num_workers - 1) - winterface->execute(worker); - else - winterface->launch(worker); - } - - // Encoding ends. - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - winterface->sync(worker); - } + launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. + if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} + +#if !CONFIG_REALTIME_ONLY +static void accumulate_fp_tile_stat(TileDataEnc *tile_data, + TileDataEnc *tile_data_t) { + tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor; + tile_data->fp_data.brightness_factor += + tile_data_t->fp_data.brightness_factor; + tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error; + tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error; + tile_data->fp_data.frame_noise_energy += + tile_data_t->fp_data.frame_noise_energy; + tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error; + tile_data->fp_data.intercount += tile_data_t->fp_data.intercount; + tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count; + tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count; + tile_data->fp_data.intra_count_low += tile_data_t->fp_data.intra_count_low; + tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; + tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; + tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; + tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; + tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; + tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs; + tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs; + tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs; + tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors; + tile_data->fp_data.intra_smooth_count += + tile_data_t->fp_data.intra_smooth_count; + tile_data->fp_data.image_data_start_row = + VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) == INVALID_ROW + ? 
VPXMAX(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) + : VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row); +} +#endif // !CONFIG_REALTIME_ONLY + +// Allocate memory for row synchronization +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + vpx_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. + row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + vpx_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + vpx_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + vpx_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + vp9_zero(*row_mt_sync); + } +} + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. 
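/* How the read/write pair behaves with sync_range == 1 (the value set above
 * in vp9_row_mt_sync_mem_alloc): a worker on row r calls
 * vp9_row_mt_sync_read(sync, r, c) before encoding column c and blocks until
 * cur_col[r - 1] >= c, i.e. until the row above has finished the same
 * column; vp9_row_mt_sync_write(sync, r, c, cols) then publishes
 * cur_col[r] = c (or cols + nsync for the final column, which releases any
 * remaining waiters on this row). */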
+ int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync != nsync - 1) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +#if !CONFIG_REALTIME_ONLY +static int first_pass_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + FIRSTPASS_DATA fp_acc_data; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_row = proc_job->vert_unit_row_num; + + best_ref_mv = zero_mv; + vp9_zero(fp_acc_data); + fp_acc_data.image_data_start_row = INVALID_ROW; + vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data, + this_tile, &best_ref_mv, mb_row); + } + } + return 0; +} + +void vp9_encode_fp_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + TileDataEnc *first_tile_col; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, FIRST_PASS_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, + multi_thread_ctxt, num_workers); + + first_tile_col = &cpi->tile_data[0]; + for (i = 1; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + accumulate_fp_tile_stat(first_tile_col, this_tile); + } +} + +static int temporal_filter_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int mb_col_start, mb_col_end; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_col_start = (this_tile->tile_info.mi_col_start) >> 1; + mb_col_end = (this_tile->tile_info.mi_col_end + 1) >> 1; + mb_row = proc_job->vert_unit_row_num; + + vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, + mb_col_start, mb_col_end); + } + } + return 0; +} + +void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = cpi->num_workers ? cpi->num_workers : 1; + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ARNR_JOB); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, + multi_thread_ctxt, num_workers); +} +#endif // !CONFIG_REALTIME_ONLY + +static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mi_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; + + vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); + } + } + return 0; +} + +void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ENCODE_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + + // Handle use_nonrd_pick_mode case. 
+ if (cpi->sf.use_nonrd_pick_mode) { + MACROBLOCK *const x = &thread_data->td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none; + int j; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + p[j].coeff = ctx->coeff_pbuf[j][0]; + p[j].qcoeff = ctx->qcoeff_pbuf[j][0]; + pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0]; + p[j].eobs = ctx->eobs_pbuf[j][0]; + } + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, + multi_thread_ctxt, num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.h b/libs/libvpx/vp9/encoder/vp9_ethread.h index 1efa4dcde2..a396e621d7 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.h +++ b/libs/libvpx/vp9/encoder/vp9_ethread.h @@ -15,6 +15,10 @@ extern "C" { #endif +#define MAX_NUM_TILE_COLS (1 << 6) +#define MAX_NUM_TILE_ROWS 4 +#define MAX_NUM_THREADS 80 + struct VP9_COMP; struct ThreadData; @@ -22,10 +26,45 @@ typedef struct EncWorkerData { struct VP9_COMP *cpi; struct ThreadData *td; int start; + int thread_id; + int tile_completion_status[MAX_NUM_TILE_COLS]; } EncWorkerData; +// Encoder row synchronization +typedef struct VP9RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} VP9RowMTSync; + void vp9_encode_tiles_mt(struct VP9_COMP *cpi); +void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); + +void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +// Allocate memory for row based multi-threading synchronization. +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm, + int rows); + +// Deallocate row based multi-threading synchronization related mutex and data. 
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync); + +void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.c b/libs/libvpx/vp9/encoder/vp9_firstpass.c index 1c716a1028..fb6b132a5b 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.c @@ -31,6 +31,7 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -40,20 +41,13 @@ #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 +#define COMPLEXITY_STATS_OUTPUT 0 -#define BOOST_BREAKOUT 12.5 -#define BOOST_FACTOR 12.5 -#define FACTOR_PT_LOW 0.70 -#define FACTOR_PT_HIGH 0.90 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 -#define MIN_KF_BOOST 300 #define NEW_MV_MODE_PENALTY 32 -#define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 #define DEFAULT_GRP_WEIGHT 1.0 #define RC_FACTOR_MIN 0.75 @@ -109,18 +103,19 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, - "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf" + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" + "%12.4lf" "\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, stats->sr_coded_error, stats->frame_noise_energy, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, + stats->pcnt_intra_low, stats->pcnt_intra_high, stats->intra_skip_pct, stats->intra_smooth_pct, stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, - stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, - stats->count, stats->duration); + stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration); fclose(fpfile); } #endif @@ -150,6 +145,8 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_neutral = 0.0; section->intra_skip_pct = 0.0; section->intra_smooth_pct = 0.0; + section->pcnt_intra_low = 0.0; + section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; section->MVr = 0.0; @@ -159,7 +156,6 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->MVrv = 0.0; section->MVcv = 0.0; section->mv_in_out_count = 0.0; - section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; section->spatial_layer_id = 0; @@ -180,6 +176,8 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_neutral += frame->pcnt_neutral; section->intra_skip_pct += frame->intra_skip_pct; section->intra_smooth_pct += frame->intra_smooth_pct; + section->pcnt_intra_low += frame->pcnt_intra_low; + section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; section->MVr += frame->MVr; @@ -189,7 +187,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->MVrv += frame->MVrv; section->MVcv += frame->MVcv; section->mv_in_out_count += frame->mv_in_out_count; - 
section->new_mv_count += frame->new_mv_count; section->count += frame->count; section->duration += frame->duration; } @@ -208,6 +205,8 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_neutral -= frame->pcnt_neutral; section->intra_skip_pct -= frame->intra_skip_pct; section->intra_smooth_pct -= frame->intra_smooth_pct; + section->pcnt_intra_low -= frame->pcnt_intra_low; + section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; section->MVr -= frame->MVr; @@ -217,7 +216,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->MVrv -= frame->MVrv; section->MVcv -= frame->MVcv; section->mv_in_out_count -= frame->mv_in_out_count; - section->new_mv_count -= frame->new_mv_count; section->count -= frame->count; section->duration -= frame->duration; } @@ -237,17 +235,26 @@ static double calculate_active_area(const VP9_COMP *cpi, return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); } +// Get the average weighted error for the clip (or corpus) +static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) { + const double av_weight = + twopass->total_stats.weight / twopass->total_stats.count; + + if (cpi->oxcf.vbr_corpus_complexity) + return av_weight * twopass->mean_mod_score; + else + return (twopass->total_stats.coded_error * av_weight) / + twopass->total_stats.count; +} + +#define ACT_AREA_CORRECTION 0.5 // Calculate a modified Error used in distributing bits between easier and // harder frames. -#define ACT_AREA_CORRECTION 0.5 -static double calculate_modified_err(const VP9_COMP *cpi, - const TWO_PASS *twopass, - const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; - double modified_error = +static double calculate_mod_frame_score(const VP9_COMP *cpi, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0); @@ -257,11 +264,37 @@ static double calculate_modified_err(const VP9_COMP *cpi, // remaining active MBs. The correction here assumes that coding // 0.5N blocks of complexity 2X is a little easier than coding N // blocks of complexity X. - modified_error *= + modified_score *= pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); - return fclamp(modified_error, twopass->modified_error_min, - twopass->modified_error_max); + return modified_score; +} + +static double calculate_norm_frame_score(const VP9_COMP *cpi, + const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + double modified_score = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0; + const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0; + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. 
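/* Taken together, the score used for two pass bit allocation is, before
 * clamping:
 *   score = av_err
 *           * pow(coded_error * weight / av_err, two_pass_vbrbias / 100)
 *           * pow(active_area, ACT_AREA_CORRECTION)
 * and calculate_norm_frame_score() further divides by
 * twopass->mean_mod_score before clamping to the two-pass vbr min/max
 * section limits (expressed as fractions of 1.0). */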
+ modified_score *= + pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); + + // Normalize to a midpoint score. + modified_score /= DOUBLE_DIVIDE_CHECK(twopass->mean_mod_score); + + return fclamp(modified_score, min_score, max_score); } // This function returns the maximum target rate per frame. @@ -292,6 +325,9 @@ void vp9_end_first_pass(VP9_COMP *cpi) { } else { output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); } + + vpx_free(cpi->twopass.fp_mb_float_stats); + cpi->twopass.fp_mb_float_stats = NULL; } static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { @@ -466,8 +502,8 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; - case VPX_BITS_10: ret_val = thresh >> 4; break; - case VPX_BITS_12: ret_val = thresh >> 8; break; + case VPX_BITS_10: ret_val = thresh << 4; break; + case VPX_BITS_12: ret_val = thresh << 8; break; default: assert(0 && "cm->bit_depth should be VPX_BITS_8, " @@ -652,39 +688,160 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) { return block_noise << 2; // Scale << 2 to account for sampling. } -#define INVALID_ROW -1 -void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { +// This function is called to test the functionality of row based +// multi-threading in unit tests for bit-exactness +static void accumulate_floating_point_stats(VP9_COMP *cpi, + TileDataEnc *first_tile_col) { + VP9_COMMON *const cm = &cpi->common; int mb_row, mb_col; - MACROBLOCK *const x = &cpi->td.mb; + first_tile_col->fp_data.intra_factor = 0; + first_tile_col->fp_data.brightness_factor = 0; + first_tile_col->fp_data.neutral_count = 0; + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + const int mb_index = mb_row * cm->mb_cols + mb_col; + first_tile_col->fp_data.intra_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor; + first_tile_col->fp_data.brightness_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor; + first_tile_col->fp_data.neutral_count += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count; + } + } +} + +static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, + FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + // The minimum error here insures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. 
+ if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) || + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (fp_acc_data->image_data_start_row > 0) { + fp_acc_data->intra_skip_count = + VPXMAX(0, + fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + } + + fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; + fp_acc_data->brightness_factor = + fp_acc_data->brightness_factor / (double)num_mbs; + fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor; + + fps->frame = cm->current_video_frame; + fps->spatial_layer_id = cpi->svc.spatial_layer_id; + + fps->coded_error = + ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs; + fps->sr_coded_error = + ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs; + fps->intra_error = + ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs; + + fps->frame_noise_energy = + (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs; + fps->count = 1.0; + fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs; + fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs; + fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs; + fps->pcnt_intra_low = (double)(fp_acc_data->intra_count_low) / num_mbs; + fps->pcnt_intra_high = (double)(fp_acc_data->intra_count_high) / num_mbs; + fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs; + fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs; + fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row); + // Currently set to 0 as most issues relate to letter boxing. + fps->inactive_zone_cols = (double)0; + + if (fp_acc_data->mvcount > 0) { + fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; + fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; + fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; + fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount; + fps->MVrv = ((double)(fp_acc_data->sum_mvrs) - + ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->MVcv = ((double)(fp_acc_data->sum_mvcs) - + ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->mv_in_out_count = + (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); + fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; + } else { + fps->MVr = 0.0; + fps->mvr_abs = 0.0; + fps->MVc = 0.0; + fps->mvc_abs = 0.0; + fps->MVrv = 0.0; + fps->MVcv = 0.0; + fps->mv_in_out_count = 0.0; + fps->pcnt_motion = 0.0; + } +} + +static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, + FIRSTPASS_DATA *fp_acc_data) { + this_tile->fp_data.intra_factor += fp_acc_data->intra_factor; + this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor; + this_tile->fp_data.coded_error += fp_acc_data->coded_error; + this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error; + this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy; + this_tile->fp_data.intra_error += fp_acc_data->intra_error; + this_tile->fp_data.intercount += fp_acc_data->intercount; + this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count; + this_tile->fp_data.neutral_count += fp_acc_data->neutral_count; + this_tile->fp_data.intra_count_low += 
fp_acc_data->intra_count_low; + this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; + this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.mvcount += fp_acc_data->mvcount; + this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; + this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; + this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc; + this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs; + this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs; + this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs; + this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors; + this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count; + this_tile->fp_data.image_data_start_row = + VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) == INVALID_ROW + ? VPXMAX(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) + : VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row); +} + +void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + TileDataEnc *tile_data, MV *best_ref_mv, + int mb_row) { + int mb_col; + MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - TileInfo tile; + TileInfo tile = tile_data->tile_info; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; - int i; + const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; + int i, c; + int num_mb_cols = get_num_cols(tile_data->tile_info, 1); int recon_yoffset, recon_uvoffset; - int64_t intra_error = 0; - int64_t coded_error = 0; - int64_t sr_coded_error = 0; - int64_t frame_noise_energy = 0; - - int sum_mvr = 0, sum_mvc = 0; - int sum_mvr_abs = 0, sum_mvc_abs = 0; - int64_t sum_mvrs = 0, sum_mvcs = 0; - int mvcount = 0; - int intercount = 0; - int second_ref_count = 0; const int intrapenalty = INTRA_MODE_PENALTY; - double neutral_count; - int intra_skip_count = 0; - int intra_smooth_count = 0; - int image_data_start_row = INVALID_ROW; - int new_mv_count = 0; - int sum_in_vectors = 0; - MV lastmv = { 0, 0 }; - TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; @@ -696,11 +853,537 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL; - double intra_factor; - double brightness_factor; - BufferPool *const pool = cm->buffer_pool; MODE_INFO mi_above, mi_left; + double mb_intra_factor; + double mb_brightness_factor; + double mb_neutral_count; + + // First pass code requires valid last and new frame buffers. + assert(new_yv12 != NULL); + assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + + if (lc != NULL) { + // Use either last frame or alt frame for motion search. 
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); + } + + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + if (gld_yv12 == NULL) { + gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + } else { + gld_yv12 = NULL; + } + } + + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + + (tile.mi_col_start >> 1); + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1); + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; + } + + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); + + // Reset above block coeffs. + recon_yoffset = + (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16; + recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) + + (tile.mi_col_start >> 1) * uv_mb_height; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; + + for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1); + ++mb_col, c++) { + int this_error; + int this_intra_error; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); + double log_intra; + int level_sample; + const int mb_index = mb_row * cm->mb_cols + mb_col; + +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + + // Adjust to the next column of MBs. + x->plane[0].src.buf = cpi->Source->y_buffer + + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; + x->plane[1].src.buf = cpi->Source->u_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + x->plane[2].src.buf = cpi->Source->v_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + + vpx_clear_system_state(); + + xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; + xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; + xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], + mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, + cm->mi_cols); + // Are edges available for intra prediction? + // Since the firstpass does not populate the mi_grid_visible, + // above_mi/left_mi must be overwritten with a nonzero value when edges + // are available. Required by vp9_predict_intra_block(). + xd->above_mi = (mb_row != 0) ? &mi_above : NULL; + xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL; + + // Do intra 16x16 prediction. + x->skip_encode = 0; + x->fp_src_pred = 0; + // Do intra prediction based on source pixels for tile boundaries + if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) { + xd->left_mi = &mi_left; + x->fp_src_pred = 1; + } + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= BLOCK_16X16 ? 
TX_16X16 : TX_8X8) : TX_4X4; + // Fix - zero the 16x16 block first. This ensures correct this_error for + // block sizes smaller than 16x16. + vp9_zero_array(x->plane[0].src_diff, 256); + vp9_encode_intra_block_plane(x, bsize, 0, 0); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); + this_intra_error = this_error; + + // Keep a record of blocks that have very low intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < get_ul_intra_threshold(cm)) { + ++(fp_acc_data->intra_skip_count); + } else if ((mb_col > 0) && + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = mb_row; + } + + // Blocks that are mainly smooth in the intra domain. + // Some special accounting for CQ but also these are better for testing + // noise levels. + if (this_error < get_smooth_intra_threshold(cm)) { + ++(fp_acc_data->intra_smooth_count); + } + + // Special case noise measurement for first frame. + if (cm->current_video_frame == 0) { + if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: break; + case VPX_BITS_10: this_error >>= 4; break; + case VPX_BITS_12: this_error >>= 8; break; + default: + assert(0 && + "cm->bit_depth should be VPX_BITS_8, " + "VPX_BITS_10 or VPX_BITS_12"); + return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + vpx_clear_system_state(); + log_intra = log(this_error + 1.0); + if (log_intra < 10.0) { + mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05); + fp_acc_data->intra_factor += mb_intra_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = + mb_intra_factor; + } else { + fp_acc_data->intra_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + else + level_sample = x->plane[0].src.buf[0]; +#else + level_sample = x->plane[0].src.buf[0]; +#endif + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample)); + fp_acc_data->brightness_factor += mb_brightness_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + mb_brightness_factor; + } else { + fp_acc_data->brightness_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_error += intrapenalty; + + // Accumulate the intra error. 
+ fp_acc_data->intra_error += (int64_t)this_error; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + } +#endif + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; + + // Other than for the first frame do a motion search. + if ((lc == NULL && cm->current_video_frame > 0) || + (lc != NULL && lc->current_video_frame_in_layer > 0)) { + int tmp_err, motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. + MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + struct buf_2d unscaled_last_source_buf_2d; + + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + } +#else + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25 || lc != NULL) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + + // Search in an older reference frame. + if (((lc == NULL && cm->current_video_frame > 1) || + (lc != NULL && lc->current_video_frame_in_layer > 1)) && + gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. 
+ int gf_motion_error; + + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++(fp_acc_data->second_ref_count); + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + fp_acc_data->sr_coded_error += gf_motion_error; + else + fp_acc_data->sr_coded_error += this_error; + } else { + fp_acc_data->sr_coded_error += motion_error; + } + } else { + fp_acc_data->sr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv->row = 0; + best_ref_mv->col = 0; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (motion_error <= this_error) { + vpx_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + (this_error < (2 * intrapenalty))) { + fp_acc_data->neutral_count += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. 
+ } else if ((this_error > NCOUNT_INTRA_THRESH) && + (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + mb_neutral_count = + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); + fp_acc_data->neutral_count += mb_neutral_count; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + mb_neutral_count; + } + + mv.row *= 8; + mv.col *= 8; + this_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE; + vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); + vp9_encode_sby_pass1(x, bsize); + fp_acc_data->sum_mvr += mv.row; + fp_acc_data->sum_mvr_abs += abs(mv.row); + fp_acc_data->sum_mvc += mv.col; + fp_acc_data->sum_mvc_abs += abs(mv.col); + fp_acc_data->sum_mvrs += mv.row * mv.row; + fp_acc_data->sum_mvcs += mv.col * mv.col; + ++(fp_acc_data->intercount); + + *best_ref_mv = mv; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // inter prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (!is_zero_mv(&mv)) { + ++(fp_acc_data->mvcount); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + + // Does the row vector point inwards or outwards? + if (mb_row < cm->mb_rows / 2) { + if (mv.row > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_row > cm->mb_rows / 2) { + if (mv.row > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + --(fp_acc_data->sum_in_vectors); + } + + // Does the col vector point inwards or outwards? 
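The row test above and the matching column test that follows add or subtract one vote per non-zero mv component; the running total feeds mv_in_out_count = sum_in_vectors / (mvcount * 2) in first_pass_stat_calc earlier in this patch. A compact standalone restatement of that vote with the encoder types replaced by plain ints (this helper is illustrative and not part of the patch):

  /* Returns the contribution one mv component makes to sum_in_vectors.
   * 'pos' is the MB row (or column), 'half' is mb_rows / 2 (or mb_cols / 2),
   * 'mv_comp' is the matching mv component. The sign convention mirrors
   * the loop in the hunk above. */
  int in_out_vote(int pos, int half, int mv_comp) {
    if (pos < half) {
      if (mv_comp > 0) return -1;
      if (mv_comp < 0) return 1;
    } else if (pos > half) {
      if (mv_comp > 0) return 1;
      if (mv_comp < 0) return -1;
    }
    return 0; /* centre band, or zero component */
  }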
+ if (mb_col < cm->mb_cols / 2) { + if (mv.col > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_col > cm->mb_cols / 2) { + if (mv.col > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + --(fp_acc_data->sum_in_vectors); + } + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { // 0,0 mv but high error + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } else { // Intra < inter error + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + if (this_intra_error < scaled_low_intra_thresh) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + if (motion_error < scaled_low_intra_thresh) { + fp_acc_data->intra_count_low += 1.0; + } else { + fp_acc_data->intra_count_high += 1.0; + } + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + fp_acc_data->intra_count_high += 1.0; + } + } + } else { + fp_acc_data->sr_coded_error += (int64_t)this_error; + } + fp_acc_data->coded_error += (int64_t)this_error; + + recon_yoffset += 16; + recon_uvoffset += uv_mb_height; + + // Accumulate row level stats to the corresponding tile stats + if (cpi->row_mt && mb_col == (tile.mi_col_end >> 1) - 1) + accumulate_fp_mb_row_stat(tile_data, fp_acc_data); + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, + num_mb_cols); + } + vpx_clear_system_state(); +} + +static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + int mb_row; + TileDataEnc tile_data; + TileInfo *tile = &tile_data.tile_info; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + // Tiling is ignored in the first pass. + vp9_tile_init(tile, cm, 0, 0); + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + best_ref_mv = zero_mv; + vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data, + &best_ref_mv, mb_row); + } +} + +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { + MACROBLOCK *const x = &cpi->td.mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TWO_PASS *twopass = &cpi->twopass; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + + LAYER_CONTEXT *const lc = + is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] + : NULL; + BufferPool *const pool = cm->buffer_pool; + + FIRSTPASS_DATA fp_temp_data; + FIRSTPASS_DATA *fp_acc_data = &fp_temp_data; + + vpx_clear_system_state(); + vp9_zero(fp_temp_data); + fp_acc_data->image_data_start_row = INVALID_ROW; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -711,12 +1394,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } #endif - vpx_clear_system_state(); - - intra_factor = 0.0; - brightness_factor = 0.0; - neutral_count = 0.0; - set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); @@ -761,7 +1438,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? 
GOLDEN_FRAME : NONE); cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0); + &cpi->scaled_source, 0, EIGHTTAP, 0); } vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -778,501 +1455,39 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_frame_init_quantizer(cpi); - for (i = 0; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - p[i].eobs = ctx->eobs_pbuf[i][1]; - } x->skip_recode = 0; vp9_init_mv_probs(cm); vp9_initialize_rd_consts(cpi); - // Tiling is ignored in the first pass. - vp9_tile_init(&tile, cm, 0, 0); + cm->log2_tile_rows = 0; - recon_y_stride = new_yv12->y_stride; - recon_uv_stride = new_yv12->uv_stride; - uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); - - for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - MV best_ref_mv = { 0, 0 }; - - // Reset above block coeffs. - recon_yoffset = (mb_row * recon_y_stride * 16); - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); - - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.row_max = - ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - - for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - int this_error; - int this_intra_error; - const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); - double log_intra; - int level_sample; - -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif - - vpx_clear_system_state(); - - xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; - xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; - xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], - mb_col << 1, num_8x8_blocks_wide_lookup[bsize], - cm->mi_rows, cm->mi_cols); - // Are edges available for intra prediction? - // Since the firstpass does not populate the mi_grid_visible, - // above_mi/left_mi must be overwritten with a nonzero value when edges - // are available. Required by vp9_predict_intra_block(). - xd->above_mi = (mb_row != 0) ? &mi_above : NULL; - xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL; - - // Do intra 16x16 prediction. - x->skip_encode = 0; - xd->mi[0]->mode = DC_PRED; - xd->mi[0]->tx_size = - use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - vp9_encode_intra_block_plane(x, bsize, 0, 0); - this_error = vpx_get_mb_ss(x->plane[0].src_diff); - this_intra_error = this_error; - - // Keep a record of blocks that have very low intra error residual - // (i.e. are in effect completely flat and untextured in the intra - // domain). In natural videos this is uncommon, but it is much more - // common in animations, graphics and screen content, so may be used - // as a signal to detect these types of content. - if (this_error < get_ul_intra_threshold(cm)) { - ++intra_skip_count; - } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { - image_data_start_row = mb_row; - } - - // Blocks that are mainly smooth in the intra domain. - // Some special accounting for CQ but also these are better for testing - // noise levels. 
- if (this_error < get_smooth_intra_threshold(cm)) { - ++intra_smooth_count; - } - - // Special case noise measurement for first frame. - if (cm->current_video_frame == 0) { - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } - -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { - case VPX_BITS_8: break; - case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH - - vpx_clear_system_state(); - log_intra = log(this_error + 1.0); - if (log_intra < 10.0) - intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); - else - intra_factor += 1.0; - -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) - level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; - else - level_sample = x->plane[0].src.buf[0]; -#else - level_sample = x->plane[0].src.buf[0]; -#endif - if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) - brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); - else - brightness_factor += 1.0; - - // Intrapenalty below deals with situations where the intra and inter - // error scores are very low (e.g. a plain black frame). - // We do not have special cases in first pass for 0,0 and nearest etc so - // all inter modes carry an overhead cost estimate for the mv. - // When the error score is very low this causes us to pick all or lots of - // INTRA modes and throw lots of key frames. - // This penalty adds a cost matching that of a 0,0 mv to the intra case. - this_error += intrapenalty; - - // Accumulate the intra error. - intra_error += (int64_t)this_error; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - } -#endif - - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.col_max = - ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - - // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; - // Assume 0,0 motion with no mv overhead. - MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; - struct buf_2d unscaled_last_source_buf_2d; - - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else - motion_error = - get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Compute the motion error of the 0,0 motion using the last source - // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. 
- unscaled_last_source_buf_2d.buf = - cpi->unscaled_last_source->y_buffer + recon_yoffset; - unscaled_last_source_buf_2d.stride = - cpi->unscaled_last_source->y_stride; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - raw_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); - } else { - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); - } -#else - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); - - // If the current best reference mv is not centered on 0,0 then do a - // 0,0 based search as well. - if (!is_zero_mv(&best_ref_mv)) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); - - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv = tmp_mv; - } - } - - // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { - // Assume 0,0 motion with no mv overhead. - int gf_motion_error; - - xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - gf_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); -#endif // CONFIG_VP9_HIGHBITDEPTH - - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, - &gf_motion_error); - - if (gf_motion_error < motion_error && gf_motion_error < this_error) - ++second_ref_count; - - // Reset to last frame as reference buffer. - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame take the - // best of the motion predicted score and the intra coded error - // (just as will be done for) accumulation of "coded_error" for - // the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; - } else { - sr_coded_error += motion_error; - } - } else { - sr_coded_error += motion_error; - } - - // Start by assuming that intra mode is best. 
- best_ref_mv.row = 0; - best_ref_mv.col = 0; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // intra prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - - if (motion_error <= this_error) { - vpx_clear_system_state(); - - // Keep a count of cases where the inter and intra were very close - // and very low. This helps with scene cut detection for example in - // cropped clips with black bars at the sides or top and bottom. - if (((this_error - intrapenalty) * 9 <= motion_error * 10) && - (this_error < (2 * intrapenalty))) { - neutral_count += 1.0; - // Also track cases where the intra is not much worse than the inter - // and use this in limiting the GF/arf group length. - } else if ((this_error > NCOUNT_INTRA_THRESH) && - (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { - neutral_count += - (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); - } - - mv.row *= 8; - mv.col *= 8; - this_error = motion_error; - xd->mi[0]->mode = NEWMV; - xd->mi[0]->mv[0].as_mv = mv; - xd->mi[0]->tx_size = TX_4X4; - xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; - vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); - vp9_encode_sby_pass1(x, bsize); - sum_mvr += mv.row; - sum_mvr_abs += abs(mv.row); - sum_mvc += mv.col; - sum_mvc_abs += abs(mv.col); - sum_mvrs += mv.row * mv.row; - sum_mvcs += mv.col * mv.col; - ++intercount; - - best_ref_mv = mv; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // inter prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_SMALL_MASK; - } - } -#endif - - if (!is_zero_mv(&mv)) { - ++mvcount; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= - ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.as_mv.row < 0 && - abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_UP_MASK; - } else if (mv.as_mv.col < 0 && - abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } -#endif - - // Non-zero vector, was it different from the last non zero vector? - if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; - lastmv = mv; - - // Does the row vector point inwards or outwards? 
- if (mb_row < cm->mb_rows / 2) { - if (mv.row > 0) - --sum_in_vectors; - else if (mv.row < 0) - ++sum_in_vectors; - } else if (mb_row > cm->mb_rows / 2) { - if (mv.row > 0) - ++sum_in_vectors; - else if (mv.row < 0) - --sum_in_vectors; - } - - // Does the col vector point inwards or outwards? - if (mb_col < cm->mb_cols / 2) { - if (mv.col > 0) - --sum_in_vectors; - else if (mv.col < 0) - ++sum_in_vectors; - } else if (mb_col > cm->mb_cols / 2) { - if (mv.col > 0) - ++sum_in_vectors; - else if (mv.col < 0) - --sum_in_vectors; - } - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { // Intra < inter error - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) - frame_noise_energy += fp_estimate_block_noise(x, bsize); - else - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { - sr_coded_error += (int64_t)this_error; - } - coded_error += (int64_t)this_error; - - // Adjust to the next column of MBs. - x->plane[0].src.buf += 16; - x->plane[1].src.buf += uv_mb_height; - x->plane[2].src.buf += uv_mb_height; - - recon_yoffset += 16; - recon_uvoffset += uv_mb_height; - } - - // Adjust to the next row of MBs. - x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - x->plane[1].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - x->plane[2].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - - vpx_clear_system_state(); - } - - // Clamp the image start to rows/2. This number of rows is discarded top - // and bottom as dead data so rows / 2 means the frame is blank. - if ((image_data_start_row > cm->mb_rows / 2) || - (image_data_start_row == INVALID_ROW)) { - image_data_start_row = cm->mb_rows / 2; - } - // Exclude any image dead zone - if (image_data_start_row > 0) { - intra_skip_count = - VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2)); - } + if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) + CHECK_MEM_ERROR( + cm, cpi->twopass.fp_mb_float_stats, + vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { FIRSTPASS_STATS fps; - // The minimum error here insures some bit allocation to frames even - // in static regions. The allocation per MB declines for larger formats - // where the typical "real" energy per MB also falls. - // Initial estimate here uses sqrt(mbs) to define the min_err, where the - // number of mbs is proportional to the image area. - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? 
cpi->initial_mbs - : cpi->common.MBs; - const double min_err = 200 * sqrt(num_mbs); - - intra_factor = intra_factor / (double)num_mbs; - brightness_factor = brightness_factor / (double)num_mbs; - fps.weight = intra_factor * brightness_factor; - - fps.frame = cm->current_video_frame; - fps.spatial_layer_id = cpi->svc.spatial_layer_id; - fps.coded_error = (double)(coded_error >> 8) + min_err; - fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; - fps.intra_error = (double)(intra_error >> 8) + min_err; - fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs; - fps.count = 1.0; - fps.pcnt_inter = (double)intercount / num_mbs; - fps.pcnt_second_ref = (double)second_ref_count / num_mbs; - fps.pcnt_neutral = (double)neutral_count / num_mbs; - fps.intra_skip_pct = (double)intra_skip_count / num_mbs; - fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs; - fps.inactive_zone_rows = (double)image_data_start_row; - // Currently set to 0 as most issues relate to letter boxing. - fps.inactive_zone_cols = (double)0; - - if (mvcount > 0) { - fps.MVr = (double)sum_mvr / mvcount; - fps.mvr_abs = (double)sum_mvr_abs / mvcount; - fps.MVc = (double)sum_mvc / mvcount; - fps.mvc_abs = (double)sum_mvc_abs / mvcount; - fps.MVrv = - ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount; - fps.MVcv = - ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; - fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); - fps.new_mv_count = new_mv_count; - fps.pcnt_motion = (double)mvcount / num_mbs; + TileDataEnc *first_tile_col; + if (!cpi->row_mt) { + cm->log2_tile_cols = 0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + first_pass_encode(cpi, fp_acc_data); + first_pass_stat_calc(cpi, &fps, fp_acc_data); } else { - fps.MVr = 0.0; - fps.mvr_abs = 0.0; - fps.MVc = 0.0; - fps.mvc_abs = 0.0; - fps.MVrv = 0.0; - fps.MVcv = 0.0; - fps.mv_in_out_count = 0.0; - fps.new_mv_count = 0.0; - fps.pcnt_motion = 0.0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + if (cpi->row_mt_bit_exact) { + cm->log2_tile_cols = 0; + vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs); + } + vp9_encode_fp_row_mt(cpi); + first_tile_col = &cpi->tile_data[0]; + if (cpi->row_mt_bit_exact) + accumulate_floating_point_stats(cpi, first_tile_col); + first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } // Dont allow a value of 0 for duration. @@ -1345,14 +1560,22 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } -static double calc_correction_factor(double err_per_mb, double err_divisor, - double pt_low, double pt_high, int q, - vpx_bit_depth_t bit_depth) { - const double error_term = err_per_mb / err_divisor; +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { + 0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25 +}; - // Adjustment based on actual quantizer to power term. - const double power_term = - VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); +static double calc_correction_factor(double err_per_mb, double err_divisor, + int q) { + const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor); + const int index = q >> 5; + double power_term; + + assert((index >= 0) && (index < (QINDEX_RANGE >> 5))); + + // Adjustment based on quantizer to the power term. 
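For reference, the rewritten correction factor replaces the old qindex-to-q based power term with a nine-entry table sampled every 32 qindex steps and linearly interpolated between samples. A standalone sketch of that evaluation (the table values are copied from the hunk above; the wrapper function and the example q values in main are illustrative):

  #include <assert.h>
  #include <stdio.h>

  #define QINDEX_RANGE 256

  /* Same sample points as q_pow_term in the patch: one entry per 32
   * qindex steps plus a final endpoint. */
  static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = {
    0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25
  };

  /* Piecewise-linear evaluation of the power term for a given qindex. */
  static double power_term_for_q(int q) {
    const int index = q >> 5;
    assert(index >= 0 && index < (QINDEX_RANGE >> 5));
    return q_pow_term[index] +
           ((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0;
  }

  int main(void) {
    /* q = 48 sits halfway between the 0.70 and 0.75 samples. */
    printf("power_term(48)  = %.3f\n", power_term_for_q(48));
    printf("power_term(255) = %.3f\n", power_term_for_q(255));
    return 0;
  }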
+ power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); // Calculate correction factor. if (power_term < 1.0) assert(error_term >= 0.0); @@ -1369,6 +1592,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; + double last_group_rate_err; // Clamp the target rate to VBR min / max limts. const int target_rate = @@ -1377,24 +1601,35 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX); inactive_zone = fclamp(inactive_zone, 0.0, 1.0); +// TODO(jimbankoski): remove #if here or below when this has been +// well tested. +#if CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif + if (target_rate <= 0) { return rc->worst_quality; // Highest value allowed } else { const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; - const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); - const double av_err_per_mb = section_err / active_mbs; + const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone); + const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); + const double av_err_per_mb = section_err / active_pct; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double last_group_rate_err; const int target_norm_bits_per_mb = (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); int q; - int is_svc_upper_layer = 0; - - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) - is_svc_upper_layer = 1; +// TODO(jimbankoski): remove #if here or above when this has been +// well tested. +#if !CONFIG_ALWAYS_ADJUST_BPM // based on recent history adjust expectations of bits per macroblock. last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / @@ -1402,14 +1637,13 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif // Try and pick a max Q that will be high enough to encode the // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { - const double factor = calc_correction_factor( - av_err_per_mb, ERR_DIVISOR, - is_svc_upper_layer ? 
SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, - FACTOR_PT_HIGH, q, cpi->common.bit_depth); + const double factor = + calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, @@ -1457,7 +1691,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, void vp9_init_second_pass(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9EncoderConfig *const oxcf = &cpi->oxcf; const int is_two_pass_svc = (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; @@ -1477,6 +1711,63 @@ void vp9_init_second_pass(VP9_COMP *cpi) { *stats = *twopass->stats_in_end; twopass->total_left_stats = *stats; + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. + { + double modified_score_total = 0.0; + const FIRSTPASS_STATS *s = twopass->stats_in; + double av_err; + + if (oxcf->vbr_corpus_complexity) { + twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0; + av_err = get_distribution_av_err(cpi, twopass); + } else { + av_err = get_distribution_av_err(cpi, twopass); + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + } + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we dont bother with + // further itterations. + modified_score_total = 0.0; + s = twopass->stats_in; + while (s < twopass->stats_in_end) { + modified_score_total += + calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err); + ++s; + } + twopass->normalized_score_left = modified_score_total; + + // If using Corpus wide VBR mode then update the clip target bandwidth to + // reflect how the clip compares to the rest of the corpus. + if (oxcf->vbr_corpus_complexity) { + oxcf->target_bandwidth = + (int64_t)((double)oxcf->target_bandwidth * + (twopass->normalized_score_left / stats->count)); + } + +#if COMPLEXITY_STATS_OUTPUT + { + FILE *compstats; + compstats = fopen("complexity_stats.stt", "a"); + fprintf(compstats, "%10.3lf\n", + twopass->normalized_score_left / stats->count); + fclose(compstats); + } +#endif + } + frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame @@ -1499,24 +1790,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; - // Scan the first pass file and calculate a modified total error based upon - // the bias/power function used to allocate bits. 
- { - const double avg_error = - stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); - const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_error_total = 0.0; - twopass->modified_error_min = - (avg_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (avg_error * oxcf->two_pass_vbrmax_section) / 100; - while (s < twopass->stats_in_end) { - modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); - ++s; - } - twopass->modified_error_left = modified_error_total; - } - // Reset the vbr bits off target counters rc->vbr_bits_off_target = 0; rc->vbr_bits_off_target_fast = 0; @@ -1551,9 +1824,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { static double get_sr_decay_rate(const VP9_COMP *cpi, const FIRSTPASS_STATS *frame) { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; @@ -1562,10 +1833,11 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, (cpi->initial_height + cpi->initial_width)); modified_pct_inter = frame->pcnt_inter; - if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) && + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < (double)NCOUNT_FRAME_II_THRESH)) { - modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + modified_pct_inter = + frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral; } modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); @@ -1574,7 +1846,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction @@ -1675,38 +1947,63 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } } -#define BASELINE_ERR_PER_MB 1000.0 +#define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_BOOST 96.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, - double this_frame_mv_in_out, double max_boost) { + double this_frame_mv_in_out) { double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - - // Correct for any inactive region in the image - num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + frame_boost = (BASELINE_ERR_PER_MB * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); - frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; - // Increase boost for frames where new data coming into frame (e.g. zoom out). - // Slightly reduce boost if there is a net balance of motion out of the frame - // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. 
+ // Small adjustment for cases where there is a zoom out if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In the extreme case the boost is halved. - else - frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + // Q correction and scalling + frame_boost = frame_boost * boost_q_correction; + + return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); +} + +#define KF_BASELINE_ERR_PER_MB 12500.0 +static double calc_kf_frame_boost(VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, + double this_frame_mv_in_out, + double max_boost) { + double frame_boost; + const double lq = vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); + const double active_area = calculate_active_area(cpi, this_frame); + + // Underlying boost factor is based on inter error ratio. + frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scalling + frame_boost = frame_boost * boost_q_correction; return VPXMIN(frame_boost, max_boost * boost_q_correction); } -static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, - int *f_boost, int *b_boost) { +static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { TWO_PASS *const twopass = &cpi->twopass; int i; double boost_score = 0.0; @@ -1720,7 +2017,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // Search forward from the proposed arf/next gf position. for (i = 0; i < f_frames; ++i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1730,8 +2027,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -1740,13 +2036,11 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *f_boost = (int)boost_score; + arf_boost = (int)boost_score; // Reset for backward looking loop. boost_score = 0.0; @@ -1758,7 +2052,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // Search backward towards last gf position. 
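Both boost helpers above are driven by the ratio of a fixed baseline error per MB to the frame's coded error stat, scaled by the active image area and a quantizer correction, then capped. A standalone numeric sketch of the non-keyframe path with the stats lookups replaced by plain arguments (the constants come from the hunks above; the helper names and the inputs in main are made up for illustration):

  #include <stdio.h>

  #define BASELINE_ERR_PER_MB 12500.0
  #define GF_MAX_BOOST 96.0

  /* Simplified stand-in for DOUBLE_DIVIDE_CHECK, which guards divisions
   * in the encoder. */
  static double divide_check(double x) { return x < 0.0000001 ? 0.0000001 : x; }

  static double min_d(double a, double b) { return a < b ? a : b; }

  /* Sketch of the calc_frame_boost() flow: active_area in [0,1], the
   * frame's coded_error stat, mv_in_out in [-1,1], and lq = the average
   * inter-frame quantizer converted to q. */
  static double frame_boost_sketch(double active_area, double coded_error,
                                   double mv_in_out, double lq) {
    const double boost_q_correction = min_d(0.5 + lq * 0.015, 1.5);
    double boost =
        (BASELINE_ERR_PER_MB * active_area) / divide_check(coded_error);
    if (mv_in_out > 0.0) boost += boost * (mv_in_out * 2.0); /* zoom out */
    boost *= boost_q_correction;
    return min_d(boost, GF_MAX_BOOST * boost_q_correction);
  }

  int main(void) {
    /* Example: fully active frame, moderate error, slight zoom out, lq = 40 */
    printf("boost = %.2f\n", frame_boost_sketch(1.0, 400.0, 0.1, 40.0));
    return 0;
  }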
for (i = -1; i >= -b_frames; --i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1768,8 +2062,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Cumulative effect of prediction quality decay. if (!flash_detected) { @@ -1778,16 +2071,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *b_boost = (int)boost_score; + arf_boost += (int)boost_score; - arf_boost = (*f_boost + *b_boost); - if (arf_boost < ((b_frames + f_frames) * 20)) - arf_boost = ((b_frames + f_frames) * 20); + if (arf_boost < ((b_frames + f_frames) * 40)) + arf_boost = ((b_frames + f_frames) * 40); arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST); return arf_boost; @@ -1821,7 +2111,7 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, int64_t total_group_bits; // Calculate the bits to be allocated to the group as a whole. - if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { total_group_bits = (int64_t)(twopass->kf_group_bits * (gf_group_err / twopass->kf_group_error_left)); } else { @@ -1829,10 +2119,11 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, } // Clamp odd edge cases. - total_group_bits = - (total_group_bits < 0) ? 0 : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; + total_group_bits = (total_group_bits < 0) + ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; // Clip based on user supplied data rate variability limit. if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) @@ -1847,7 +2138,7 @@ static int calculate_boost_bits(int frame_count, int boost, int allocation_chunks; // return 0 for invalid inputs (could arise e.g. through rounding errors) - if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; allocation_chunks = (frame_count * 100) + boost; @@ -1875,8 +2166,33 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { arf_buffer_indices[1] = ARF_SLOT2; } +// Used in corpus vbr: Calculates the total normalized group complexity score +// for a given number of frames starting at the current position in the stats +// file. +static double calculate_group_score(VP9_COMP *cpi, double av_score, + int frame_count) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FIRSTPASS_STATS *s = twopass->stats_in; + double score_total = 0.0; + int i = 0; + + // We dont ever want to return a 0 score here. 
+ if (frame_count == 0) return 1.0; + + while ((i < frame_count) && (s < twopass->stats_in_end)) { + score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score); + ++s; + ++i; + } + assert(i == frame_count); + + return score_total; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -1885,7 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int frame_index = 1; int target_frame_size; int key_frame; - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; int mid_boost_bits = 0; int mid_frame_idx; @@ -1895,8 +2211,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; - int last_frame_bits; - int last_frame_reduction; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; // Only encode alt reference frame in temporal base layer. if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; @@ -1968,17 +2286,14 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); - - // The last frame in the group is used less as a predictor so reduce - // its allocation a little. - if (normal_frames > 1) { + if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); - last_frame_reduction = normal_frame_bits / 16; - last_frame_bits = normal_frame_bits - last_frame_reduction; - } else { + else normal_frame_bits = (int)total_group_bits; - last_frame_bits = normal_frame_bits; - last_frame_reduction = 0; + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); } // Allocate bits to the other frames in the group. @@ -1990,11 +2305,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } - target_frame_size = (i == (normal_frames - 1)) - ? last_frame_bits - : (i == mid_frame_idx) - ? normal_frame_bits + last_frame_reduction - : normal_frame_bits; + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); + } + + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; + } if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { mid_boost_bits += (target_frame_size >> 4); @@ -2015,6 +2337,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } + // Add in some extra bits for the middle frame in the group. + gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction; + // Note: // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. 
Otherwise prior to the call to @@ -2057,6 +2382,9 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, } // Analyse and define a gf/arf group. +#define ARF_DECAY_BREAKOUT 0.10 +#define ARF_ABS_ZOOM_THRESH 4.0 + static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2066,8 +2394,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; - double boost_score = 0.0; - double old_boost_score = 0.0; double gf_group_err = 0.0; double gf_group_raw_error = 0.0; double gf_group_noise = 0.0; @@ -2079,9 +2405,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mod_frame_err = 0.0; double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2089,10 +2413,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; + double abs_mv_in_out_thresh; + double sr_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); unsigned int allow_alt_ref = is_altref_enabled(cpi); - int f_boost = 0; - int b_boost = 0; int flash_detected; int active_max_gf_interval; int active_min_gf_interval; @@ -2111,7 +2436,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -2132,6 +2458,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; + abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH; // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. @@ -2142,8 +2469,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->common.bit_depth)); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); - if (active_min_gf_interval > rc->max_gf_interval) - active_min_gf_interval = rc->max_gf_interval; + active_min_gf_interval = + VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf); if (cpi->multi_arf_allowed) { active_max_gf_interval = rc->max_gf_interval; @@ -2154,11 +2481,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // interval to spread the cost of the GF. active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) + // We have: active_min_gf_interval <= + // rc->max_gf_interval + arf_active_or_kf. 
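Taken together, the clamps above and the branch below keep both active bounds at or above rc->min_gf_interval and at or below rc->max_gf_interval + arf_active_or_kf. A consolidated sketch of the single-ARF path (when multi_arf_allowed the maximum is simply rc->max_gf_interval); the function and parameter names are illustrative, not the libvpx API.

static void active_gf_interval_bounds(int min_gf_interval, int max_gf_interval,
                                      int arf_active_or_kf, int int_max_q,
                                      int int_lbq, int *active_min,
                                      int *active_max) {
  const int hard_cap = max_gf_interval + arf_active_or_kf;
  int lo = min_gf_interval + arf_active_or_kf +
           (int_max_q / 200 < 2 ? int_max_q / 200 : 2);
  int hi = 12 + arf_active_or_kf + (int_lbq / 6 < 4 ? int_lbq / 6 : 4);
  if (lo > hard_cap) lo = hard_cap;
  if (hi < lo)
    hi = lo; /* never allow the active max to fall below the active min */
  else if (hi > hard_cap)
    hi = hard_cap;
  *active_min = lo;
  *active_max = hi;
}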
+ if (active_max_gf_interval < active_min_gf_interval) { active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; + } else { + active_max_gf_interval = VPXMIN(active_max_gf_interval, + rc->max_gf_interval + arf_active_or_kf); + } // Would the active max drop us out just before the near the next kf? if ((active_max_gf_interval <= rc->frames_to_key) && @@ -2172,7 +2502,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_err += mod_frame_err; gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; @@ -2197,8 +2528,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator = decay_accumulator * loop_decay_rate; - // Monitor for static sections. zero_motion_accumulator = VPXMIN( zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); @@ -2210,12 +2539,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { allow_alt_ref = 0; break; } - } - // Calculate a boost number for this frame. - boost_score += - decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + if (i == 1) { + sr_accumulator += next_frame.coded_error; + } else { + sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error); + } + } // Break out conditions. if ( @@ -2228,15 +2561,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ((rc->frames_to_key - i) >= rc->min_gf_interval) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { - boost_score = old_boost_score; + (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || + (sr_accumulator > next_frame.intra_error)))) { break; } *this_frame = next_frame; - old_boost_score = boost_score; } // Was the group length constrained by the requirement for a new KF? @@ -2245,9 +2575,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { + const int forward_frames = (rc->frames_to_key - i >= i - 1) + ? i - 1 + : VPXMAX(0, rc->frames_to_key - i); + // Calculate the boost for alt ref. - rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; // Test to see if multi arf is appropriate. @@ -2257,10 +2590,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ? 1 : 0; } else { - rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); rc->source_alt_ref_pending = 0; } +#ifdef AGGRESSIVE_VBR + // Limit maximum boost based on interval length. 
+ rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 140); +#else + rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); +#endif + // Set the interval until the next gf. rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); @@ -2273,7 +2613,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int j; for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + gf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; gf_group_skip_pct += this_frame->intra_skip_pct; @@ -2293,7 +2634,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); // Calculate an estimate of the maxq needed for the group. - // We are more agressive about correcting for sections + // We are more aggressive about correcting for sections // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget. @@ -2312,6 +2653,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + +#if CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif } // Context Adjustment of ARNR filter strength @@ -2328,7 +2675,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_bits); // Adjust KF group bits and error remaining. - twopass->kf_group_error_left -= (int64_t)gf_group_err; + twopass->kf_group_error_left -= gf_group_err; // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2346,10 +2693,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } - +#if !CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; +#endif } // Threshold for use of the lagging second reference frame. High second ref @@ -2377,6 +2725,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // ratio in the next frame. #define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 +#define II_FACTOR 12.5 +// Test for very low intra complexity which could cause false key frames +#define V_LOW_INTRA 0.5 static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *last_frame, @@ -2416,7 +2767,7 @@ static int test_candidate_kf(TWO_PASS *twopass, // Examine how well the key frame predicts subsequent frames. 
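For reference, a standalone version of the intra/inter ("II") ratio computed inside the loop that follows, using the II_FACTOR and KF_II_MAX values defined above; the epsilon guard only approximates DOUBLE_DIVIDE_CHECK() and the function name is illustrative. A large ratio means inter prediction from the candidate key frame stays much cheaper than intra coding, so the candidate keeps accumulating boost; the loop breaks out once the ratio and the boost gains fall away, or when intra_error drops below V_LOW_INTRA.

static double next_ii_ratio(double intra_error, double coded_error) {
  const double kIIFactor = 12.5; /* II_FACTOR above */
  const double kIIMax = 128.0;   /* KF_II_MAX above */
  const double ratio = kIIFactor * intra_error / (coded_error + 0.000001);
  return ratio > kIIMax ? kIIMax : ratio;
}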
for (i = 0; i < 16; ++i) { - double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error / + double next_iiratio = (II_FACTOR * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; @@ -2436,7 +2787,7 @@ static int test_candidate_kf(TWO_PASS *twopass, 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200)) { + (local_next_frame.intra_error < V_LOW_INTRA)) { break; } @@ -2462,6 +2813,16 @@ static int test_candidate_kf(TWO_PASS *twopass, } #define FRAMES_TO_CHECK_DECAY 8 +#define MIN_KF_TOT_BOOST 300 +#define KF_BOOST_SCAN_MAX_FRAMES 32 + +#ifdef AGGRESSIVE_VBR +#define KF_MAX_FRAME_BOOST 80.0 +#define MAX_KF_TOT_BOOST 4800 +#else +#define KF_MAX_FRAME_BOOST 96.0 +#define MAX_KF_TOT_BOOST 5400 +#endif static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; @@ -2474,15 +2835,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; - int loop_decay_counter = 0; double decay_accumulator = 1.0; - double av_decay_accumulator = 0.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_mod_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; - + double sr_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; @@ -2503,10 +2863,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_to_key = 1; - twopass->kf_group_bits = 0; // Total bits available to kf group - twopass->kf_group_error_left = 0; // Group modified error score. + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0.0; // Group modified error score. - kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_mod_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Initialize the decay rates for the recent frames to check for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; @@ -2516,7 +2877,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { while (twopass->stats_in < twopass->stats_in_end && rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Load the next frame's stats. last_frame = *this_frame; @@ -2576,7 +2938,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. 
for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame, av_err); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; @@ -2593,7 +2956,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int j; for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } rc->frames_to_key = new_frame_to_key; } @@ -2601,11 +2965,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } // Calculate the number of bits that should be assigned to the kf group. - if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) { // Maximum number of bits for a single normal frame (not key frame). const int max_bits = frame_max_bits(rc, &cpi->oxcf); @@ -2615,7 +2980,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default allocation based on bits left and relative // complexity of the section. twopass->kf_group_bits = (int64_t)( - twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + twopass->bits_left * (kf_group_err / twopass->normalized_score_left)); // Clip based on maximum per frame rate defined by the user. max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; @@ -2631,34 +2996,36 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Scan through the kf group collating various stats used to determine // how many bits to spend on it. - decay_accumulator = 1.0; boost_score = 0.0; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // Monitor for static sections. - zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(cpi, &next_frame)); + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + double frame_boost; + double zm_factor; - // Not all frames in the group are necessarily used in calculating boost. - if ((i <= rc->max_gf_interval) || - ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { - const double frame_boost = - calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST); + // Monitor for static sections. + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - // How fast is prediction quality decaying. - if (!detect_flash(twopass, 0)) { - const double loop_decay_rate = - get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator *= loop_decay_rate; - decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR); - av_decay_accumulator += decay_accumulator; - ++loop_decay_counter; - } - boost_score += (decay_accumulator * frame_boost); + // Factor 0.75-1.25 based on how much of frame is static. 
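A one-line helper makes the range of that factor explicit (the name is illustrative and not part of the patch): zero_motion_accumulator lies in [0.0, 1.0], so the factor spans [0.75, 1.25]; with the non-AGGRESSIVE_VBR KF_MAX_FRAME_BOOST of 96.0, the per-frame cap passed to calc_kf_frame_boost() below therefore ranges from 72.0 to 120.0.

static double kf_zero_motion_factor(double zero_motion_accumulator) {
  /* Fully static section -> 1.25, fully moving section -> 0.75. */
  return 0.75 + zero_motion_accumulator / 2.0;
}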
+ zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + // The second (lagging) ref error is not valid immediately after + // a key frame because either the lag has not built up (in the case of + // the first key frame or it points to a refernce before the new key + // frame. + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, + KF_MAX_FRAME_BOOST * zm_factor); + + boost_score += frame_boost; + if (frame_boost < 25.00) break; + } else { + break; } } - av_decay_accumulator /= (double)loop_decay_counter; reset_fpf_position(twopass, start_position); @@ -2670,9 +3037,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { start_position, twopass->stats_in_end, rc->frames_to_key); // Apply various clamps for min and max boost - rc->kf_boost = (int)(av_decay_accumulator * boost_score); - rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST); + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, @@ -2686,12 +3053,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group->rf_level[0] = KF_STD; // Note the total error score of the kf group minus the key frame itself. - twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + twopass->kf_group_error_left = (kf_group_err - kf_mod_err); // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. - twopass->modified_error_left -= kf_group_err; + twopass->normalized_score_left -= kf_group_err; if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to normal-sized frame on keyframes. @@ -2923,16 +3290,10 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = gf_group->bit_allocation[gf_group->index]; rc->base_frame_target = target_rate; - { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = - log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); - twopass->mb_smooth_pct = this_frame.intra_smooth_pct; - } + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; // Update the total stats remaining structure. 
subtract_stats(&twopass->total_left_stats, &this_frame); diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.h b/libs/libvpx/vp9/encoder/vp9_firstpass.h index 6aa39cdc00..000ecd7792 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.h +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.h @@ -39,6 +39,39 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif +#define INVALID_ROW -1 + +typedef struct { + double frame_mb_intra_factor; + double frame_mb_brightness_factor; + double frame_mb_neutral_count; +} FP_MB_FLOAT_STATS; + +typedef struct { + double intra_factor; + double brightness_factor; + int64_t coded_error; + int64_t sr_coded_error; + int64_t frame_noise_energy; + int64_t intra_error; + int intercount; + int second_ref_count; + double neutral_count; + double intra_count_low; // Coded intra but low variance + double intra_count_high; // Coded intra high variance + int intra_skip_count; + int image_data_start_row; + int mvcount; + int sum_mvr; + int sum_mvr_abs; + int sum_mvc; + int sum_mvc_abs; + int64_t sum_mvrs; + int64_t sum_mvcs; + int sum_in_vectors; + int intra_smooth_count; +} FIRSTPASS_DATA; + typedef struct { double frame; double weight; @@ -50,6 +83,8 @@ typedef struct { double pcnt_motion; double pcnt_second_ref; double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance double intra_skip_pct; double intra_smooth_pct; // % of blocks that are smooth double inactive_zone_rows; // Image mask rows top and bottom. @@ -61,7 +96,6 @@ typedef struct { double MVrv; double MVcv; double mv_in_out_count; - double new_mv_count; double duration; double count; int64_t spatial_layer_id; @@ -104,9 +138,8 @@ typedef struct { FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; - double modified_error_min; - double modified_error_max; - double modified_error_left; + double mean_mod_score; + double normalized_score_left; double mb_av_energy; double mb_smooth_pct; @@ -115,6 +148,9 @@ typedef struct { uint8_t *this_frame_mb_stats; FIRSTPASS_MB_STATS firstpass_mb_stats; #endif + + FP_MB_FLOAT_STATS *fp_mb_float_stats; + // An indication of the content type of the current frame FRAME_CONTENT_TYPE fr_content_type; @@ -122,7 +158,7 @@ typedef struct { int64_t kf_group_bits; // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; + double kf_group_error_left; double bpm_factor; int rolling_arf_group_target_bits; @@ -142,12 +178,20 @@ typedef struct { } TWO_PASS; struct VP9_COMP; +struct ThreadData; +struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); +void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, + struct ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + struct TileDataEnc *tile_data, + MV *best_ref_mv, int mb_row); + void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); void vp9_twopass_postencode_update(struct VP9_COMP *cpi); diff --git a/libs/libvpx/vp9/encoder/vp9_frame_scale.c b/libs/libvpx/vp9/encoder/vp9_frame_scale.c new file mode 100644 index 0000000000..a410d0407f --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_frame_scale.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; + int x, y, i; + +#if HAVE_SSSE3 || HAVE_NEON + // TODO(linfengz): The 4:3 specialized C code is disabled by default since + // it's much slower than the general version which calls vpx_scaled_2d() even + // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference + // for the platforms which have faster optimization. + if (4 * dst->y_crop_width == 3 * src_w && + 4 * dst->y_crop_height == 3 * src_h) { + // Specialize 4 to 3 scaling. + // Example pixel locations. + // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.) + // phase_scaler = 0 | phase_scaler = 8 + // | + // X O S O S O X | O O O O O + // | + // | + // | S S S + // | + // | + // O O O O O | O O O O O + // | + // S S S S | + // | + // | + // | S S S + // O O O O O | O O O O O + // | + // | + // | + // S S S S | + // | + // O O O O O | O O O O O + // | S S S + // | + // | + // | + // | + // X O S O S O X | O O O O O + + const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int dst_w = dst_ws[i]; + const int dst_h = dst_hs[i]; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 3) { + for (x = 0; x < dst_w; x += 3) { + const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3; + uint8_t *dst_ptr = dsts[i] + y * dst_stride + x; + + // Must call c function because its optimization doesn't support 3x3. + vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3); + } + } + } + } else +#endif + { + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 
1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } + } + } + } + + vpx_extend_frame_borders(dst); +} diff --git a/libs/libvpx/vp9/encoder/vp9_job_queue.h b/libs/libvpx/vp9/encoder/vp9_job_queue.h new file mode 100644 index 0000000000..89c08f207a --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_job_queue.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VP9_ENCODER_VP9_JOB_QUEUE_H_ + +typedef enum { + FIRST_PASS_JOB, + ENCODE_JOB, + ARNR_JOB, + NUM_JOB_TYPES, +} JOB_TYPE; + +// Encode job parameters +typedef struct { + int vert_unit_row_num; // Index of the vertical unit row + int tile_col_id; // tile col id within a tile + int tile_row_id; // tile col id within a tile +} JobNode; + +// Job queue element parameters +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Job information context of the module + JobNode job_info; +} JobQueue; + +// Job queue handle +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Counter to store the number of jobs picked up for processing + int num_jobs_acquired; +} JobQueueHandle; + +#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.c b/libs/libvpx/vp9/encoder/vp9_mbgraph.c index e000220b97..46d626def1 100644 --- a/libs/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.c @@ -45,10 +45,13 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, mv_sf->search_method = HEX; vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param, - x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv, - dst_mv, 0, 0); + cpi->sf.mv.search_method, x->errorperbit, + cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0); mv_sf->search_method = old_search_method; + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { @@ -66,9 +69,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); } diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.c b/libs/libvpx/vp9/encoder/vp9_mcomp.c index 2d9bcbda67..44f01be25a 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.c @@ -21,6 +21,7 @@ #include "vpx_ports/mem.h" #include 
"vp9/common/vp9_common.h" +#include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_encoder.h" @@ -52,6 +53,24 @@ void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; } +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv) { + subpel_mv_limits->col_min = VPXMAX(umv_window_limits->col_min * 8, + ref_mv->col - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->col_max = VPXMIN(umv_window_limits->col_max * 8, + ref_mv->col + MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_min = VPXMAX(umv_window_limits->row_min * 8, + ref_mv->row - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_max = VPXMIN(umv_window_limits->row_max * 8, + ref_mv->row + MAX_FULL_PEL_VAL * 8); + + subpel_mv_limits->col_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->col_min); + subpel_mv_limits->col_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->col_max); + subpel_mv_limits->row_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->row_min); + subpel_mv_limits->row_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->row_max); +} + int vp9_init_search_range(int size) { int sr = 0; // Minimum search size no matter what the passed in value. @@ -82,10 +101,8 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int error_per_bit) { if (mvcost) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; - // This product sits at a 32-bit ceiling right now and any additional - // accuracy in either bit cost or error cost will cause it to overflow. - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + return (int)ROUND64_POWER_OF_TWO( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT + PIXEL_TRANSFORM_ERROR_SCALE); } @@ -138,17 +155,6 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { cfg->total_steps = ss_count / cfg->searches_per_step; } -/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes - * from the same math as in mv_err_cost(). */ -#define MVC(r, c) \ - (mvcost \ - ? 
((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] + \ - mvcost[0][((r)-rr)] + mvcost[1][((c)-rc)]) * \ - error_per_bit + \ - 8192) >> \ - 14 \ - : 0) - // convert motion vector component to offset for sv[a]f calc static INLINE int sp(int x) { return x & 7; } @@ -161,6 +167,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ if (second_pred == NULL) { \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ @@ -169,7 +177,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { src_stride, &sse, second_pred); \ } \ tmpmse = thismse; \ - tmpmse += MVC(r, c); \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -186,13 +194,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { /* checks if (r, c) has better score than previous best */ #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ if (second_pred == NULL) \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ besterr = v; \ br = r; \ bc = c; \ @@ -273,34 +284,38 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -#define SETUP_SUBPEL_SEARCH \ - const uint8_t *const z = x->plane[0].src.buf; \ - const int src_stride = x->plane[0].src.stride; \ - const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = INT_MAX; \ - unsigned int sse; \ - unsigned int whichdir; \ - int thismse; \ - const unsigned int halfiters = iters_per_step; \ - const unsigned int quarteriters = iters_per_step; \ - const unsigned int eighthiters = iters_per_step; \ - const int y_stride = xd->plane[0].pre[0].stride; \ - const int offset = bestmv->row * y_stride + bestmv->col; \ - const uint8_t *const y = xd->plane[0].pre[0].buf; \ - \ - int rr = ref_mv->row; \ - int rc = ref_mv->col; \ - int br = bestmv->row * 8; \ - int bc = bestmv->col * 8; \ - int hstep = 4; \ - const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); \ - const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); \ - const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); \ - const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); \ - int tr = br; \ - int tc = bc; \ - \ - bestmv->row *= 8; \ +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const z = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = UINT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int rr 
= ref_mv->row; \ + int rc = ref_mv->col; \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + MvLimits subpel_mv_limits; \ + \ + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); \ + minc = subpel_mv_limits.col_min; \ + maxc = subpel_mv_limits.col_max; \ + minr = subpel_mv_limits.row_min; \ + maxr = subpel_mv_limits.row_max; \ + \ + bestmv->row *= 8; \ bestmv->col *= 8; static unsigned int setup_center_error( @@ -346,7 +361,7 @@ static unsigned int setup_center_error( #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE int divide_and_round(const int n, const int d) { +static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } @@ -364,10 +379,13 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) { // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { - *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), - (cost_list[1] - 2 * cost_list[0] + cost_list[3])); - *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), - (cost_list[4] - 2 * cost_list[0] + cost_list[2])); + const int64_t x0 = (int64_t)cost_list[1] - cost_list[3]; + const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3]; + const int64_t x1 = (int64_t)cost_list[4] - cost_list[2]; + const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2]; + const int b = 1 << (bits - 1); + *ic = (int)divide_and_round(x0 * b, y0); + *ir = (int)divide_and_round(x1 * b, y1); } uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, @@ -401,10 +419,6 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)thismse; (void)cost_list; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -430,7 +444,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { int ir, ic; - unsigned int minpt; + unsigned int minpt = INT_MAX; get_cost_surf_min(cost_list, &ir, &ic, 2); if (ir != 0 || ic != 0) { CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); @@ -470,10 +484,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - return besterr; } @@ -534,10 +544,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -620,10 +626,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - return besterr; } @@ -646,7 +648,7 @@ uint32_t vp9_find_best_sub_pixel_tree( const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; + unsigned int besterr = UINT_MAX; 
unsigned int sse; int thismse; const int y_stride = xd->plane[0].pre[0].stride; @@ -659,16 +661,21 @@ uint32_t vp9_find_best_sub_pixel_tree( int bc = bestmv->col * 8; int hstep = 4; int iter, round = 3 - forced_stop; - const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); - const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); - const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); - const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); + + int minc, maxc, minr, maxr; int tr = br; int tc = bc; const MV *search_step = search_step_table; int idx, best_idx = -1; unsigned int cost_array[5]; int kr, kc; + MvLimits subpel_mv_limits; + + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + minc = subpel_mv_limits.col_min; + maxc = subpel_mv_limits.col_max; + minr = subpel_mv_limits.row_min; + maxr = subpel_mv_limits.row_max; if (!(allow_hp && use_mv_hp(ref_mv))) if (round == 3) round = 2; @@ -708,7 +715,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } } @@ -737,7 +744,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } if (best_idx < 4 && best_idx >= 0) { @@ -769,14 +776,9 @@ uint32_t vp9_find_best_sub_pixel_tree( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - return besterr; } -#undef MVC #undef CHECK_BETTER static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, @@ -858,13 +860,12 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, // candidates as indicated in the num_candidates and candidates arrays // passed into this function // -static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, - int sad_per_bit, int do_init_search, - int *cost_list, const vp9_variance_fn_ptr_t *vfp, - int use_mvcost, const MV *center_mv, MV *best_mv, - const int num_candidates[MAX_PATTERN_SCALES], - const MV candidates[MAX_PATTERN_SCALES] - [MAX_PATTERN_CANDIDATES]) { +static int vp9_pattern_search( + const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, + int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, MV *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) { const MACROBLOCKD *const xd = &x->e_mbd; static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, @@ -1822,7 +1823,9 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, } #if CONFIG_VP9_HIGHBITDEPTH - { + // TODO(jingning): Implement integral projection functions for high bit-depth + // setting and remove this part of code. + if (xd->bd != 8) { unsigned int this_sad; tmp_mv->row = 0; tmp_mv->col = 0; @@ -1998,9 +2001,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, int range = sf->mesh_patterns[0].range; int baseline_interval_divisor; - // Keep track of number of exhaustive calls (this frame in this thread). - ++(*x->ex_search_count_ptr); - // Trap illegal values for interval and range for this function. 
if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || (interval > range)) @@ -2042,197 +2042,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - return best_sad; -} - -int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = 
&x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2318,11 +2127,14 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = + unsigned int best_sad = INT_MAX; + int i, j; + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + best_sad = fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); - int i, j; for (i = 0; i < search_range; ++i) { int best_site = -1; @@ -2355,24 +2167,12 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -#define MIN_EX_SEARCH_LIMIT 128 -static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { - const SPEED_FEATURES *const sf = &cpi->sf; - const int max_ex = - VPXMAX(MIN_EX_SEARCH_LIMIT, - (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); - - return sf->allow_exhaustive_searches && - (sf->exhaustive_searches_thresh < INT_MAX) && - (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; -} - int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, 
BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd) { + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->mv.search_method; + const SEARCH_METHODS method = (SEARCH_METHODS)search_method; vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; if (cost_list) { @@ -2383,9 +2183,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cost_list[4] = INT_MAX; } - // Keep track of number of searches (this frame in this thread). - ++(*x->m_search_count_ptr); - switch (method) { case FAST_DIAMOND: var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, @@ -2413,7 +2210,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cost_list, fn_ptr, ref_mv, tmp_mv); // Should we allow a follow on exhaustive search? - if (is_exhaustive_allowed(cpi, x)) { + if ((sf->exhaustive_searches_thresh < INT_MAX) && + !cpi->rc.is_src_frame_alt_ref) { int64_t exhuastive_thr = sf->exhaustive_searches_thresh; exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -2440,3 +2238,85 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, return var; } + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)z; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)allow_hp; \ + (void)forced_stop; \ + (void)hstep; \ + (void)rr; \ + (void)rc; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ + (void)cost_list; + +// Return the maximum MV. +uint32_t vp9_return_max_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h) { + COMMON_MV_TEST; + + (void)minr; + (void)minc; + + bestmv->row = maxr; + bestmv->col = maxc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} +// Return the minimum MV. +uint32_t vp9_return_min_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h) { + COMMON_MV_TEST; + + (void)maxr; + (void)maxc; + + bestmv->row = minr; + bestmv->col = minc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. 
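Motion vectors here are stored in 1/8-pel units; when high-precision MVs are not in use, only even component values (1/4-pel positions) are legal, which is what the lower_mv_precision() call just below enforces. A minimal per-component sketch of the idea; the function name is illustrative and the rounding direction shown is only one reasonable choice, not necessarily the one the real helper uses.

static short clear_hp_bit(short mv_component) {
  if (mv_component & 1)
    mv_component += (mv_component > 0) ? -1 : 1; /* round toward zero */
  return mv_component;
}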
+ lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.h b/libs/libvpx/vp9/encoder/vp9_mcomp.h index 2726b9e230..b8db2c3536 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.h @@ -81,6 +81,8 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore; extern fractional_mv_step_fp vp9_skip_sub_pixel_tree; +extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; +extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, @@ -105,9 +107,13 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd); + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd); + +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.c b/libs/libvpx/vp9/encoder/vp9_multi_thread.c new file mode 100644 index 0000000000..da06fb151d --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_multi_thread.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hdl = NULL; + void *next = NULL; + JobNode *job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_handle = NULL; +#endif + + row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]); + job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex_handle = &row_mt_info->job_mutex; +#endif + +// lock the mutex for queue access +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex_handle); +#endif + next = job_queue_hdl->next; + if (NULL != next) { + JobQueue *job_queue = (JobQueue *)next; + job_info = &job_queue->job_info; + // Update the next job in the queue + job_queue_hdl->next = job_queue->next; + job_queue_hdl->num_jobs_acquired++; + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex_handle); +#endif + + return job_info; +} + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { + struct VP9Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int jobs_per_tile_col, total_jobs; + + jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); + // Calculate the total number of jobs + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; + + multi_thread_ctxt->job_queue = + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)); + +#if CONFIG_MULTITHREAD + // Create mutex for each tile + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + pthread_mutex_init(&row_mt_info->job_mutex, NULL); + } +#endif + + // Allocate memory for row based multi-threading + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); + if (cpi->sf.adaptive_rd_thresh_row_mt) { + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + this_tile->row_base_thresh_freq_fact = + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + } + } + + // Assign the sync pointer of tile row zero for every tile row > 0 + for (tile_row = 1; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileDataEnc *this_col_tile = &cpi->tile_data[tile_col]; + this_tile->row_mt_sync = this_col_tile->row_mt_sync; + } + } + + // Calculate the number of vertical units in the given tile row + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols]; + TileInfo *tile_info = &this_tile->tile_info; + multi_thread_ctxt->num_tile_vert_sbs[tile_row] = + get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + } +} + +void 
vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; +#if CONFIG_MULTITHREAD + int tile_row; +#endif + + // Deallocate memory for job queue + if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue); + +#if CONFIG_MULTITHREAD + // Destroy mutex for each tile + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex); + } +#endif + + // Free row based multi-threading sync memory + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + if (cpi->sf.adaptive_rd_thresh_row_mt) { + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + } + } + } +#endif +} + +void vp9_multi_thread_tile_init(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int i; + + for (i = 0; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col); + vp9_zero(this_tile->fp_data); + this_tile->fp_data.image_data_start_row = INVALID_ROW; + } +} + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers) { + int tile_id = 0; + int i; + + // Allocating the threads for the tiles + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == tile_cols) tile_id = 0; + } +} + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hndl; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; +#endif + int num_jobs_remaining; + + row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id]; + job_queue_hndl = &row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex = &row_mt_info->job_mutex; +#endif + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex); +#endif + num_jobs_remaining = + multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex); +#endif + + return (num_jobs_remaining); +} + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { + VP9_COMMON *const cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + JobQueue *job_queue = multi_thread_ctxt->job_queue; + const int tile_cols = 1 << cm->log2_tile_cols; + int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int tile_col, i; + + jobs_per_tile_col = (job_type != ENCODE_JOB) ? 
cm->mb_rows : sb_rows; + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; + // memset the entire job queue buffer to zero + memset(job_queue, 0, total_jobs * sizeof(JobQueue)); + + // Job queue preparation + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col]; + JobQueue *job_queue_curr, *job_queue_temp; + int tile_row = 0; + + tile_ctxt->job_queue_hdl.next = (void *)job_queue; + tile_ctxt->job_queue_hdl.num_jobs_acquired = 0; + + job_queue_curr = job_queue; + job_queue_temp = job_queue; + + // loop over all the vertical rows + for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col; + job_row_num++, jobs_per_tile++) { + job_queue_curr->job_info.vert_unit_row_num = job_row_num; + job_queue_curr->job_info.tile_col_id = tile_col; + job_queue_curr->job_info.tile_row_id = tile_row; + job_queue_curr->next = (void *)(job_queue_temp + 1); + job_queue_curr = ++job_queue_temp; + + if (ENCODE_JOB == job_type) { + if (jobs_per_tile >= + multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) { + tile_row++; + jobs_per_tile = -1; + } + } + } + + // Set the last pointer to NULL + job_queue_curr += -1; + job_queue_curr->next = (void *)NULL; + + // Move to the next tile + job_queue += jobs_per_tile_col; + } + + for (i = 0; i < cpi->num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + thread_data->thread_id = i; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) + thread_data->tile_completion_status[tile_col] = 0; + } +} + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols) { + int tile_col; + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_num_jobs_remaining = 0; + int num_jobs_remaining; + + // Mark the completion to avoid check in the loop + tile_completion_status[*cur_tile_id] = 1; + // Check for the status of all the tiles + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + if (tile_completion_status[tile_col] == 0) { + num_jobs_remaining = + vp9_get_job_queue_status(multi_thread_ctxt, tile_col); + // Mark the completion to avoid checks during future switches across tiles + if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1; + if (num_jobs_remaining > max_num_jobs_remaining) { + max_num_jobs_remaining = num_jobs_remaining; + tile_id = tile_col; + } + } + } + + if (-1 == tile_id) { + return 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + return 0; + } +} diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.h b/libs/libvpx/vp9/encoder/vp9_multi_thread.h new file mode 100644 index 0000000000..bfc0c0ae4f --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H +#define VP9_ENCODER_VP9_MULTI_THREAD_H + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_job_queue.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id); + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type); + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id); + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers); + +void vp9_multi_thread_tile_init(VP9_COMP *cpi); + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi); + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols); + +#endif // VP9_ENCODER_VP9_MULTI_THREAD_H diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c index 0e5d8ade4a..276a0c7852 100644 --- a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -21,6 +21,15 @@ #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_encoder.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. +static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; ne->level = kLowLow; @@ -32,15 +41,22 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { - ne->thresh = 130; + ne->thresh = 140; + } else if (width * height >= 640 * 360) { + ne->thresh = 115; } - ne->num_frames_estimate = 20; + ne->num_frames_estimate = 15; } static int enable_noise_estimation(VP9_COMP *const cpi) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif // Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) return 1; + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->common.width >= 320 && cpi->common.height >= 180) + return 1; #endif // Only allow noise estimate under certain encoding mode. // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. @@ -49,8 +65,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 480) + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) return 1; else return 0; @@ -92,6 +108,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { void vp9_update_noise_estimate(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); // Estimate of noise level every frame_period frames. 
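The hunk below keeps the eight-frame refresh period but, for spatial SVC, counts superframes rather than individual frames so that all spatial layers of a superframe make the same decision. A rough condensation of that gating (hypothetical helper name, not the encoder's code):

    /* Returns nonzero when the noise estimate should be refreshed. */
    static int sketch_refresh_noise_estimate(int enabled, int num_spatial_layers,
                                             int current_video_frame,
                                             int current_superframe,
                                             int frame_period) {
      const int frame_counter = (num_spatial_layers > 1) ? current_superframe
                                                         : current_video_frame;
      return enabled && (frame_counter % frame_period == 0);
    }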
int frame_period = 8; int thresh_consec_zeromv = 6; @@ -99,17 +116,31 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { unsigned int thresh_sum_spatial = (200 * 200) << 8; unsigned int thresh_spatial_var = (32 * 32) << 8; int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7; + int frame_counter = cm->current_video_frame; // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) last_source = &cpi->denoiser.last_source; + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { + last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. + if (cm->width > 640 && cm->width < 1920) { + thresh_consec_zeromv = 4; + thresh_sum_diff = 200; + thresh_sum_spatial = (120 * 120) << 8; + thresh_spatial_var = (48 * 48) << 8; + } + } #endif ne->enabled = enable_noise_estimation(cpi); - if (!ne->enabled || cm->current_video_frame % frame_period != 0 || - last_source == NULL || ne->last_w != cm->width || - ne->last_h != cm->height) { + if (cpi->svc.number_spatial_layers > 1) + frame_counter = cpi->svc.current_superframe; + if (!ne->enabled || frame_counter % frame_period != 0 || + last_source == NULL || + (cpi->svc.number_spatial_layers == 1 && + (ne->last_w != cm->width || ne->last_h != cm->height))) { #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif if (last_source != NULL) { @@ -117,12 +148,18 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cpi->rc.avg_frame_low_motion < 50) { + } else if (cm->current_video_frame > 60 && + cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->svc.current_superframe > 1) { vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + copy_frame(&cpi->denoiser.last_source, cpi->Source); + } #endif return; } else { @@ -162,43 +199,42 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { int bl_index1 = bl_index + 1; int bl_index2 = bl_index + cm->mi_cols; int bl_index3 = bl_index2 + 1; - // Only consider blocks that are likely steady background. i.e, have - // been encoded as zero/low motion x (= thresh_consec_zeromv) frames - // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - int is_skin = 0; - if (cpi->use_skin_detection) { - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, bsize, consec_zeromv, 0); - } - if (frame_low_motion && - cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv && - !is_skin) { - // Compute variance. 
- unsigned int sse; - unsigned int variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += variance / ((spatial_variance >> 9) + 1); - num_samples++; + // Only consider blocks that are likely steady background. i.e, have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) { + int is_skin = 0; + if (cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, + src_uvstride, bsize, consec_zeromv, 0); + } + if (!is_skin) { + unsigned int sse; + // Compute variance. + unsigned int variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + // Only consider this block as valid for noise measurement if the + // average term (sse - variance = N * avg^{2}, N = 16X16) of the + // temporal residual is small (avoid effects from lighting + // change). + if ((sse - variance) < thresh_sum_diff) { + unsigned int sse2; + const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, const_source, 0, &sse2); + // Avoid blocks with high brightness and high spatial variance. + if ((sse2 - spatial_variance) < thresh_sum_spatial && + spatial_variance < thresh_spatial_var) { + avg_est += low_res ? 
variance >> 4 + : variance / ((spatial_variance >> 9) + 1); + num_samples++; + } } } } @@ -230,14 +266,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->count = 0; ne->level = vp9_noise_estimate_extract_level(ne); #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); #endif } } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif } diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.c b/libs/libvpx/vp9/encoder/vp9_picklpf.c index 1583cc8abc..1c2c55b9e4 100644 --- a/libs/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libs/libvpx/vp9/encoder/vp9_picklpf.c @@ -181,6 +181,11 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); #endif // CONFIG_VP9_HIGHBITDEPTH + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) + filt_guess = 5 * filt_guess >> 3; + if (cm->frame_type == KEY_FRAME) filt_guess -= 4; lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); } else { diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.c b/libs/libvpx/vp9/encoder/vp9_pickmode.c index 76d2611d89..f2f323a282 100644 --- a/libs/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libs/libvpx/vp9/encoder/vp9_pickmode.c @@ -158,6 +158,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MvLimits tmp_mv_limits = x->mv_limits; int rv = 0; int cost_list[5]; + int search_subpel = 1; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); if (scaled_ref_frame) { @@ -170,6 +171,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + // Limit motion vector for large lightning change. + if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) { + x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10); + x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10); + x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10); + x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10); + } + assert(x->mv_best_ref_index[ref] <= 2); if (x->mv_best_ref_index[ref] < 2) mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; @@ -184,9 +193,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, else center_mv = tmp_mv->as_mv; - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &center_mv, - &tmp_mv->as_mv, INT_MAX, 0); + if (x->sb_use_mv_part) { + tmp_mv->as_mv.row = x->sb_mvrow_part >> 3; + tmp_mv->as_mv.col = x->sb_mvcol_part >> 3; + } else { + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &center_mv, &tmp_mv->as_mv, INT_MAX, 0); + } x->mv_limits = tmp_mv_limits; @@ -202,10 +216,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); - if (rv) { - const int subpel_force_stop = use_base_mv && cpi->sf.base_mv_aggressive - ? 2 - : cpi->sf.mv.subpel_force_stop; + // For SVC on non-reference frame, avoid subpel for (0, 0) motion.
+ if (cpi->use_svc && cpi->svc.non_reference_frame) { + if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0; + } + + if (rv && search_subpel) { + int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -296,11 +314,24 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, } } +// Adjust the ac_thr according to speed, width, height and normalized sum +static int ac_thr_factor(const int speed, const int width, const int height, + const int norm_sum) { + if (speed >= 8 && norm_sum < 5) { + if (width <= 640 && height <= 480) + return 4; + else + return 2; + } + return 1; +} + static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, unsigned int *sse_y, - int mi_row, int mi_col, int *early_term) { + int mi_row, int mi_col, int *early_term, + int *flag_preduv_computed) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -312,7 +343,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, const uint32_t dc_quant = pd->dequant[0]; const uint32_t ac_quant = pd->dequant[1]; const int64_t dc_thr = dc_quant * dc_quant >> 6; - const int64_t ac_thr = ac_quant * ac_quant >> 6; + int64_t ac_thr = ac_quant * ac_quant >> 6; unsigned int var; int sum; int skip_dc = 0; @@ -341,6 +372,20 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, *var_y = var; *sse_y = sse; +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) + ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, + (abs(sum) >> (bw + bh)), + cpi->svc.temporal_layer_id); + else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#endif + if (cpi->common.tx_mode == TX_MODE_SELECT) { if (sse > (var << 2)) tx_size = VPXMIN(max_txsize_lookup[bsize], @@ -428,28 +473,33 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. 
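The loop that follows decides, per chroma plane, whether the transform can be skipped: the AC energy (variance) must fall below an AC threshold derived from the dequant step, and the DC energy (sse minus variance) must fall below a DC threshold. A compact restatement of the two conditions (illustrative helper, not the encoder's code):

    /* Chroma plane is transform-skippable when both AC and DC energy are small. */
    static int sketch_uv_plane_skippable(unsigned int var, unsigned int sse,
                                         unsigned int uv_ac_thr,
                                         unsigned int uv_dc_thr) {
      const int ac_small = (var < uv_ac_thr) || (var == 0);
      const int dc_small = ((sse - var) < uv_dc_thr) || (sse == var);
      return ac_small && dc_small;
    }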
for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); - const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); - const int uv_bw = b_width_log2_lookup[uv_bsize]; - const int uv_bh = b_height_log2_lookup[uv_bsize]; - const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + - (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); - int j = i - 1; + if (cpi->oxcf.speed < 8 || x->color_sensitivity[i - 1]) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const int uv_bw = b_width_log2_lookup[uv_bsize]; + const int uv_bh = b_height_log2_lookup[uv_bsize]; + const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + + (uv_bh - b_height_log2_lookup[unit_size]); + const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + int j = i - 1; - vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; + var_uv[j] = cpi->fn_ptr[uv_bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); - if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && - (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) - skip_uv[j] = 1; - else - break; + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } else { + skip_uv[i - 1] = 1; + } } // If the transform in YUV planes are skippable, the mode search checks @@ -457,7 +507,6 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, if (skip_uv[0] & skip_uv[1]) { *early_term = 1; } - return; } @@ -590,24 +639,9 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *out_dist_sum += dist << 4; } -#if CONFIG_VP9_HIGHBITDEPTH static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { - MACROBLOCKD *xd = &x->e_mbd; - unsigned int var_y, sse_y; - - (void)tx_size; - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y, - &sse_y); - *sse = INT_MAX; - *skippable = 0; - return; -} -#else -static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, - int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { + TX_SIZE tx_size, int rd_computed) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; @@ -624,6 +658,39 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; +#if CONFIG_VP9_HIGHBITDEPTH + // TODO(jingning): Implement the high bit-depth Hadamard transforms and + // remove this check condition. 
+ // TODO(marpan): Use this path (model_rd) for 8bit under certain conditions + // for now, as the vp9_quantize_fp below for highbitdepth build is slow. + if (xd->bd != 8 || + (cpi->oxcf.speed > 5 && cpi->common.frame_type != KEY_FRAME && + bsize < BLOCK_32X32)) { + unsigned int var_y, sse_y; + (void)tx_size; + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y); + *sse = INT_MAX; + *skippable = 0; + return; + } +#endif + + if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && + (bsize < BLOCK_32X32 || + (cpi->use_svc && + (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) { + unsigned int var_y, sse_y; + (void)tx_size; + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y); + *sse = INT_MAX; + *skippable = 0; + return; + } + (void)cpi; // The max tx_size passed in is TX_16X16. @@ -648,24 +715,21 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, switch (tx_size) { case TX_16X16: - vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); - vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, + vpx_hadamard_16x16(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: - vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); - vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, + vpx_hadamard_8x8(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: assert(0); break; @@ -699,7 +763,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, if (*eob == 1) this_rdc->rate += (int)abs(qcoeff[0]); else if (*eob > 1) - this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4); + this_rdc->rate += vpx_satd(qcoeff, step << 4); this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2; } @@ -711,7 +775,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT); this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT); } -#endif static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize, MACROBLOCK *x, MACROBLOCKD *xd, @@ -799,13 +862,11 @@ static void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } -static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col, - MV_REFERENCE_FRAME ref_frame, - PREDICTION_MODE this_mode, unsigned int var_y, - unsigned int sse_y, - struct buf_2d yv12_mb[][MAX_MB_PLANE], - int *rate, int64_t *dist) { +static void encode_breakout_test( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, 
unsigned int var_y, + unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate, + int64_t *dist, int *flag_preduv_computed) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); @@ -815,6 +876,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) motion_low = 0; @@ -865,9 +927,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, thresh_dc_uv = 0; } - // Skip UV prediction unless breakout is zero (lossless) to save - // computation with low impact on the result - if (x->encode_breakout == 0) { + if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) { xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); @@ -944,7 +1004,7 @@ static void estimate_block_intra(int plane, int block, int row, int col, int64_t this_sse = INT64_MAX; // TODO(jingning): This needs further refactoring. block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, - VPXMIN(tx_size, TX_16X16)); + VPXMIN(tx_size, TX_16X16), 0); } else { unsigned int var = 0; unsigned int sse = 0; @@ -958,10 +1018,11 @@ static void estimate_block_intra(int plane, int block, int row, int col, args->rdc->dist += this_rdc.dist; } -static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = { +static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = { { THR_DC, THR_V_PRED, THR_H_PRED, THR_TM }, { THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV }, { THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG }, + { THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA }, }; static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, @@ -981,6 +1042,32 @@ static int mode_offset(const PREDICTION_MODE mode) { } } +static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, + const int *const thresh_fact) { + int is_rd_less_than_thresh; + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + return is_rd_less_than_thresh; +} + +static INLINE void update_thresh_freq_fact_row_mt( + VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx; + int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV && + ref_frame == LAST_FRAME && source_variance < 5) { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32); + } else { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + static INLINE void update_thresh_freq_fact( VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, @@ -1080,40 +1167,31 @@ typedef struct { PREDICTION_MODE pred_mode; } REF_MODE; -#define RT_INTER_MODES 8 +#define RT_INTER_MODES 12 static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { LAST_FRAME, ZEROMV }, { LAST_FRAME, 
NEARESTMV }, { GOLDEN_FRAME, ZEROMV }, { LAST_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV }, - { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV } + { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV }, + { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, + { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; -static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { - { LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV }, - { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, + +#define RT_INTER_MODES_SVC 8 +static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = { + { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, + { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } }; -static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - // Reduce the intra cost penalty for small blocks (<=16x16). - int reduction_fac = - (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) - // Don't reduce intra cost penalty if estimated noise level is high. - reduction_fac = 0; - return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q, - cm->bit_depth) >> - reduction_fac; -} - static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, - int force_skip_low_temp_var) { + int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); @@ -1127,7 +1205,7 @@ static INLINE void find_predictors( int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) { + if (cm->use_prev_frame_mvs || comp_pred_allowed) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); } else { @@ -1153,7 +1231,8 @@ static INLINE void find_predictors( static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, PREDICTION_MODE this_mode, RD_COST *this_rdc, BLOCK_SIZE bsize, int mv_row, int mv_col, - int is_last_frame) { + int is_last_frame, int lowvar_highsumdiff, + int is_skin) { // Bias against MVs associated with NEWMV mode that are very different from // top/left neighbors. if (this_mode == NEWMV) { @@ -1200,9 +1279,12 @@ static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, // If noise estimation is enabled, and estimated level is above threshold, // add a bias to LAST reference with small motion, for large blocks. 
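Both branches that follow apply the same bias: rdcost is rescaled by 7/8, roughly a 12.5% discount, which is enough to tip close decisions toward the small-motion LAST candidate without overriding a clear winner. A quick numeric illustration (hypothetical helper, not the encoder's code):

    #include <stdint.h>

    /* Same arithmetic as the bias below: scale an RD cost by 7/8. */
    static int64_t sketch_bias_rdcost(int64_t rdcost) {
      return 7 * (rdcost >> 3); /* e.g. 1000 -> 875 */
    }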
if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 && - is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) { - this_rdc->rdcost = 7 * this_rdc->rdcost >> 3; - } + is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); + else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 && + is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 && + mv_col > -16) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); } #if CONFIG_VP9_TEMPORAL_DENOISING @@ -1237,6 +1319,7 @@ static void recheck_zeromv_after_denoising( ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || (ctx_den->best_ref_frame == GOLDEN_FRAME && + cpi->svc.number_spatial_layers == 1 && decision == FILTER_ZEROMV_BLOCK))) { // Check if we should pick ZEROMV on denoised signal. int rate = 0; @@ -1257,16 +1340,17 @@ static void recheck_zeromv_after_denoising( [INTER_OFFSET(ZEROMV)]; this_rdc.dist = dist; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); - // Switch to ZEROMV if the rdcost for ZEROMV on denoised source - // is lower than best_ref mode (on original source). + // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is higher than best_ref mode (on original source). if (this_rdc.rdcost > best_rdc->rdcost) { this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; mi->interp_filter = ctx_den->best_pred_filter; - if (ctx_den->best_ref_frame == INTRA_FRAME) + if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; - else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { + mi->interp_filter = SWITCHABLE_FILTERS; + } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { mi->mv[0].as_int = ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame] .as_int; @@ -1343,10 +1427,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[0]; PREDICTION_MODE best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - MV_REFERENCE_FRAME usable_ref_frame; + MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; TX_SIZE best_tx_size = TX_SIZES; INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; @@ -1355,11 +1440,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; - const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; - const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES; + const int *const rd_thresh_freq_fact = + (cpi->sf.adaptive_rd_thresh_row_mt) + ? 
&(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) + : tile_data->thresh_freq_fact[bsize]; + INTERP_FILTER filter_ref; const int bsl = mi_width_log2_lookup[bsize]; const int pred_filter_search = @@ -1393,13 +1485,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int perform_intra_pred = 1; int use_golden_nonzeromv = 1; int force_skip_low_temp_var = 0; + int skip_ref_find_pred[4] = { 0 }; + unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; + unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; + int denoise_svc_pickmode = 1; #endif + INTERP_FILTER filter_gf_svc = EIGHTTAP; + MV_REFERENCE_FRAME best_second_ref_frame = NONE; + int comp_modes = 0; + int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; + int flag_svc_subpel = 0; + int svc_mv_col = 0; + int svc_mv_row = 0; init_ref_frame_cost(cm, xd, ref_frame_cost); + memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); + if (reuse_inter_pred) { int i; for (i = 0; i < 3; i++) { @@ -1455,18 +1561,43 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && - cpi->denoiser.denoising_level > kDenLowLow) { - vp9_denoiser_reset_frame_stats(ctx); + if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->use_svc) { + int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, + cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; + } + if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) + vp9_denoiser_reset_frame_stats(ctx); } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) { + if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { usable_ref_frame = GOLDEN_FRAME; } + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref) + usable_ref_frame = ALTREF_FRAME; + + if (cpi->rc.is_src_frame_alt_ref) { + skip_ref_find_pred[LAST_FRAME] = 1; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + } + if (!cm->show_frame) { + if (cpi->rc.frames_since_key == 1) { + usable_ref_frame = LAST_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + skip_ref_find_pred[ALTREF_FRAME] = 1; + } + } + } + // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && @@ -1484,19 +1615,54 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->sf.short_circuit_low_temp_var) { force_skip_low_temp_var = get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3, + // skip golden reference. 
+ if ((cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) && + force_skip_low_temp_var) { + usable_ref_frame = LAST_FRAME; + } } if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; + if (cpi->oxcf.speed >= 8 && !cpi->use_svc && + ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) + usable_ref_frame = LAST_FRAME; + + // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF. + if (cm->reference_mode == REFERENCE_MODE_SELECT && + cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) + comp_modes = 2; + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { - find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize, force_skip_low_temp_var); + if (!skip_ref_find_pred[ref_frame]) { + find_predictors(cpi, x, ref_frame, frame_mv, const_motion, + &ref_frame_skip_mask, flag_list, tile_data, mi_row, + mi_col, yv12_mb, bsize, force_skip_low_temp_var, + comp_modes > 0); + } } - for (idx = 0; idx < RT_INTER_MODES; ++idx) { + if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) + x->sb_use_mv_part = 0; + + // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used + // an averaging filter for downsampling (phase = 8). If so, we will test + // a nonzero motion mode on the spatial (goldeen) reference. + // The nonzero motion is half pixel shifted to left and top (-4, -4). + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + svc_force_zero_mode[GOLDEN_FRAME - 1] && + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + svc_mv_col = -4; + svc_mv_row = -4; + flag_svc_subpel = 1; + } + + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; int mode_index; @@ -1504,9 +1670,63 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int64_t this_sse; int is_skippable; int this_early_term = 0; - PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + int rd_computed = 0; + int flag_preduv_computed[2] = { 0 }; + int inter_mv_mode = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_gf_mv = 0; + PREDICTION_MODE this_mode; + second_ref_frame = NONE; - if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode; + if (idx < num_inter_modes) { + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + } else { + // Add (0,0) compound modes. + this_mode = ZEROMV; + ref_frame = LAST_FRAME; + if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME; + second_ref_frame = ALTREF_FRAME; + comp_pred = 1; + } + + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; + + if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { + force_gf_mv = 1; + // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), + // otherwise set NEWMV to (svc_mv_col, svc_mv_row). 
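Background for the force_gf_mv handling that follows: when the lower spatial layer was downsampled with the averaging filter (phase == 8), the upscaled golden (spatial) reference is effectively offset by half a pixel, so the only non-zero motion worth testing against it is (-4, -4) in 1/8-pel units, half a pixel up and to the left. A condensed sketch of that setup, mirroring the assignments made earlier in this hunk (illustrative helper):

    /* Pick the single golden-frame MV to test for an SVC enhancement layer. */
    static void sketch_svc_golden_mv(int downsample_filter_phase,
                                     int *svc_mv_row, int *svc_mv_col,
                                     int *flag_svc_subpel) {
      if (downsample_filter_phase == 8) { /* averaging downsample filter */
        *svc_mv_row = -4; /* half pel up, in 1/8-pel units */
        *svc_mv_col = -4; /* half pel left, in 1/8-pel units */
        *flag_svc_subpel = 1;
      }
    }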
+ if (this_mode == NEWMV) { + frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col; + frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row; + } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col || + frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) { + continue; + } + } + + if (comp_pred) { + const struct segmentation *const seg = &cm->seg; + if (!cpi->allow_comp_inter_inter) continue; + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; + } + + // For SVC, skip the golden (spatial) reference search if sse of zeromv_last + // is below threshold. + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + sse_zeromv_normalized < thresh_svc_skip_golden) + continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && this_mode != NEARESTMV) { @@ -1515,9 +1735,28 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; - ref_frame = ref_mode_set[idx].ref_frame; - if (cpi->use_svc) { - ref_frame = ref_mode_set_svc[idx].ref_frame; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.is_src_frame_alt_ref && + (ref_frame != ALTREF_FRAME || + frame_mv[this_mode][ref_frame].as_int != 0)) + continue; + + if (!cm->show_frame && ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > 0 && + cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; } if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; @@ -1527,38 +1766,58 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } - if (cpi->sf.short_circuit_low_temp_var == 2 && force_skip_low_temp_var && - ref_frame == LAST_FRAME && this_mode == NEWMV) { + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || + (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && + force_skip_low_temp_var && ref_frame == LAST_FRAME && + this_mode == NEWMV) { continue; } if (cpi->use_svc) { - if (svc_force_zero_mode[ref_frame - 1] && + if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (!force_skip_low_temp_var && + if (sf->reference_masking && !(frame_mv[this_mode][ref_frame].as_int == 0 && ref_frame == LAST_FRAME)) { - i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); + } } if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) + for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } mi->ref_frame[0] = ref_frame; - set_ref_ptrs(cm, xd, ref_frame, NONE); + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 @@ -1570,16 +1829,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.frames_since_golden > 4) mode_rd_thresh = mode_rd_thresh << 3; - if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) continue; - if (this_mode == NEWMV) { + if (this_mode == NEWMV && !force_gf_mv) { if (ref_frame > LAST_FRAME && !cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { int tmp_sad; uint32_t dis; - int cost_list[5]; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; if (bsize < BLOCK_16X16) continue; @@ -1607,17 +1870,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else if (svc->use_base_mv && svc->spatial_layer_id) { if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; - int base_mv_sad = INT_MAX; - const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; const uint8_t *const pre_buf = xd->plane[0].pre[0].buf + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - base_mv_sad = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); - if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) { + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. 
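The code that follows first bails out of NEWMV for small SVC blocks whose base-layer MV is (0,0), then replaces the old SAD-plus-bias acceptance test for the scaled base-layer MV with an SSE-based one: the candidate is rejected outright when its error is far above the best SSE seen so far (more aggressively on mostly static content) and accepted as a NEWMV seed only when it is within 2x of that best. A rough condensation (hypothetical helper; thresholds copied from the hunk):

    /* Decide whether the scaled base-layer MV is usable as a NEWMV seed. */
    static int sketch_accept_base_layer_mv(unsigned int base_mv_sse,
                                           unsigned int best_sse_sofar,
                                           int avg_frame_low_motion,
                                           int base_mv_aggressive) {
      const int scale = (avg_frame_low_motion > 60) ? 2 : 4;
      if (base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
        return 0; /* base-layer prediction error is too large */
      return base_mv_sse < (best_sse_sofar << 1); /* base layer mv is good */
    }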
+ if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + continue; + if (base_mv_sse < (best_sse_sofar << 1)) { // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) { @@ -1640,6 +1923,22 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector + // causes some regression, leave it for duplicate zero-mv for now, until + // regression issue is resolved. + for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int && + frame_mv[inter_mv_mode][ref_frame].as_int == 0) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no // need to compute best_pred_sad which is only used to skip golden NEWMV. if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME && @@ -1654,13 +1953,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (this_mode != NEARESTMV && + if (this_mode != NEARESTMV && !comp_pred && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) continue; mi->mode = this_mode; mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; // Search for the best prediction filter type, when the resulting // motion vector is at sub-pixel accuracy level for luma component, i.e., @@ -1678,17 +1978,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && + (ref_frame == GOLDEN_FRAME && !force_gf_mv && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; int64_t pf_dist[3]; + int curr_rate[3]; unsigned int pf_var[3]; unsigned int pf_sse[3]; TX_SIZE pf_tx_size[3]; int64_t best_cost = INT64_MAX; INTERP_FILTER best_filter = SWITCHABLE, filter; PRED_BUFFER *current_pred = this_mode_pred; + rd_computed = 1; for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { int64_t cost; @@ -1696,6 +1998,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], &pf_var[filter], &pf_sse[filter]); + curr_rate[filter] = pf_rate[filter]; pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); cost = RDCOST(x->rdmult, x->rddiv, 
pf_rate[filter], pf_dist[filter]); pf_tx_size[filter] = mi->tx_size; @@ -1721,7 +2024,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->interp_filter = best_filter; mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = pf_rate[best_filter]; + this_rdc.rate = curr_rate[best_filter]; this_rdc.dist = pf_dist[best_filter]; var_y = pf_var[best_filter]; sse_y = pf_sse[best_filter]; @@ -1731,16 +2034,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst.stride = this_mode_pred->stride; } } else { -// TODO(jackychen): the low-bitdepth condition causes a segfault in -// high-bitdepth builds. -// https://bugs.chromium.org/p/webm/issues/detail?id=1250 -#if CONFIG_VP9_HIGHBITDEPTH - const int large_block = bsize > BLOCK_32X32; -#else + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. const int large_block = - x->sb_is_skin ? bsize > BLOCK_32X32 : bsize >= BLOCK_32X32; -#endif + (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + svc_force_zero_mode[ref_frame - 1]) + mi->interp_filter = filter_gf_svc; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. @@ -1749,17 +2056,26 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cm->base_qindex) { model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, - &this_early_term); + &this_early_term, flag_preduv_computed); } else { + rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y); } + // Save normalized sse (between current and last frame) for (0, 0) motion. 
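// Assuming b_width_log2_lookup[] and b_height_log2_lookup[] give the log2 of
// the block dimensions in 4-pel units, the shift below divides sse_y by the
// number of 4x4 units in the block, e.g. for BLOCK_64X64 the shift is
// 4 + 4 = 8, i.e. sse_y / 256, giving an average SSE per 4x4 unit.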
+ if (cpi->use_svc && ref_frame == LAST_FRAME && + frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_normalized = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; } if (!this_early_term) { this_sse = (int64_t)sse_y; block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, - VPXMIN(mi->tx_size, TX_16X16)); + VPXMIN(mi->tx_size, TX_16X16), rd_computed); + x->skip_txfm[0] = is_skippable; if (is_skippable) { this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); @@ -1785,13 +2101,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } - if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { RD_COST rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]); - if (x->color_sensitivity[0]) + if (x->color_sensitivity[0] && !flag_preduv_computed[0]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); - if (x->color_sensitivity[1]) + flag_preduv_computed[0] = 1; + } + if (x->color_sensitivity[1] && !flag_preduv_computed[1]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + flag_preduv_computed[1] = 1; + } model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); this_rdc.rate += rdc_uv.rate; this_rdc.dist += rdc_uv.dist; @@ -1800,6 +2121,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += rate_mv; this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; + // TODO(marpan): Add costing for compound mode. this_rdc.rate += ref_frame_cost[ref_frame]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); @@ -1810,7 +2132,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize, frame_mv[this_mode][ref_frame].as_mv.row, frame_mv[this_mode][ref_frame].as_mv.col, - ref_frame == LAST_FRAME); + ref_frame == LAST_FRAME, x->lowvar_highsumdiff, + x->sb_is_skin); } // Skipping checking: test to see if this block can be reconstructed by @@ -1818,7 +2141,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->allow_encode_breakout) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, - &this_rdc.dist); + &this_rdc.dist, flag_preduv_computed); if (x->skip) { this_rdc.rate += rate_mv; this_rdc.rdcost = @@ -1827,7 +2150,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow) { vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); // Keep track of zero_last cost. 
@@ -1838,6 +2161,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (void)ctx; #endif + mode_checked[this_mode][ref_frame] = 1; + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; best_mode = this_mode; @@ -1846,6 +2171,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_ref_frame = ref_frame; best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; + best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { free_pred_buffer(best_pred); @@ -1872,6 +2198,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; x->skip_txfm[0] = best_mode_skip_txfm; + mi->ref_frame[1] = best_second_ref_frame; // For spatial enhancemanent layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if @@ -1884,12 +2211,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, svc_force_zero_mode[best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) + perform_intra_pred = 0; // Perform intra prediction search, if the best SAD is above a certain // threshold. - if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred && - (best_rdc.rdcost == INT64_MAX || - (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize))) { + if (best_rdc.rdcost == INT64_MAX || + ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || + x->content_state_sb == kVeryHighSad) && + perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && + !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; int i; TX_SIZE best_intra_tx_size = TX_SIZES; @@ -1904,17 +2236,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - this_mode_pred->data, this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, + NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, - 0, NULL, 0, bw, bh); + 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, - NULL, 0, bw, bh); + 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH best_pred = this_mode_pred; } @@ -1933,8 +2266,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize])) continue; - if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) continue; mi->mode = 
this_mode; @@ -1975,8 +2312,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_mode = this_mode; best_intra_tx_size = mi->tx_size; best_ref_frame = INTRA_FRAME; + best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; best_mode_skip_txfm = x->skip_txfm[0]; } } @@ -1992,6 +2331,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst = orig_dst; mi->mode = best_mode; mi->ref_frame[0] = best_ref_frame; + mi->ref_frame[1] = best_second_ref_frame; x->skip_txfm[0] = best_mode_skip_txfm; if (!is_inter_block(mi)) { @@ -2002,23 +2342,30 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, - bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, + bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && cpi->resize_pending == 0 && - cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) { + denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && + cpi->denoiser.reset == 0) { VP9_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + // TODO(marpan): There is an issue with denoising when the + // superblock partitioning scheme is based on the pickmode. + // Remove this condition when the issue is resolved. + if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, frame_mv, reuse_inter_pred, best_tx_size, best_mode, best_ref_frame, best_pred_filter, @@ -2030,6 +2377,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #endif + if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + x->arf_frame_usage++; + else if (best_ref_frame != INTRA_FRAME) + x->lastgolden_frame_usage++; + if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; @@ -2041,16 +2393,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // TODO(yunqingwang): Check intra mode mask and only update freq_fact // for those valid modes. 
for (i = 0; i < intra_modes; i++) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - INTRA_FRAME, best_mode_idx, intra_mode_list[i]); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, INTRA_FRAME, + best_mode_idx, intra_mode_list[i]); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + INTRA_FRAME, best_mode_idx, + intra_mode_list[i]); } } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; if (best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - ref_frame, best_mode_idx, this_mode); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, ref_frame, + best_mode_idx, this_mode); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + ref_frame, best_mode_idx, this_mode); } } } @@ -2197,12 +2560,12 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } vp9_set_mv_search_range(&x->mv_limits, - &mbmi_ext->ref_mvs[0]->as_mv); + &mbmi_ext->ref_mvs[ref_frame][0].as_mv); - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - x->sadperbit4, cond_cost_list(cpi, cost_list), - &mbmi_ext->ref_mvs[ref_frame][0].as_mv, - &tmp_mv, INT_MAX, 0); + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + x->sadperbit4, cond_cost_list(cpi, cost_list), + &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0); x->mv_limits = tmp_mv_limits; @@ -2235,7 +2598,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride, + CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0, vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3, diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.c b/libs/libvpx/vp9/encoder/vp9_quantize.c index de96c6e068..09f61ead26 100644 --- a/libs/libvpx/vp9/encoder/vp9_quantize.c +++ b/libs/libvpx/vp9/encoder/vp9_quantize.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" @@ -21,77 +22,67 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. 
- (void)zbin_ptr; - (void)quant_shift_ptr; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i; int eob = -1; - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; + (void)iscan; + (void)skip_block; + assert(!skip_block); - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[rc != 0]; - const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
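// Worked example with hypothetical values (quant_ptr for the "fp" path is
// assumed to hold roughly (1 << 16) / dequant): with dequant = 32,
// quant = 2048 and round = 16, a coefficient of 100 quantizes as
//   tmp = ((100 + 16) * 2048) >> 16 = 3,
// so qcoeff becomes 3 and dqcoeff becomes 3 * 32 = 96.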
+ for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -100,38 +91,35 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, // TODO(jingning) Refactor this file and combine functions with similar // operations. void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; - (void)zbin_ptr; - (void)quant_shift_ptr; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp = 0; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (tmp) eob = i; + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (tmp) eob = i; } *eob_ptr = eob + 1; } @@ -139,36 +127,33 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; - (void)zbin_ptr; - (void)quant_shift_ptr; + (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - uint32_t abs_qcoeff = 0; - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - 
coeff_sign; + for (i = 0; i < n_coeffs; i++) { + int abs_qcoeff = 0; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - const int64_t tmp = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (abs_qcoeff) eob = i; + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -179,22 +164,28 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), + *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int n_coeffs = 4 * 4; + + if (x->skip_block) { + memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); + memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); + return; + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, + x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); return; } #endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block], - scan, iscan); + vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.c b/libs/libvpx/vp9/encoder/vp9_ratectrl.c index 93eddd655a..b7f3a0e897 100644 --- a/libs/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.c @@ -15,6 +15,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -43,8 +44,6 @@ #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -#define FRAME_OVERHEAD_BITS 200 - #if CONFIG_VP9_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ @@ -90,10 +89,17 @@ static int inter_minq_12[QINDEX_RANGE]; static int rtc_minq_12[QINDEX_RANGE]; #endif +#ifdef AGGRESSIVE_VBR +static int gf_high = 2400; +static int gf_low = 400; +static int kf_high = 4000; +static int kf_low = 400; +#else static int gf_high = 2000; static int gf_low = 400; static int kf_high = 5000; static int kf_low = 400; +#endif // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. 
@@ -123,9 +129,14 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, const double maxq = vp9_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); +#ifdef AGGRESSIVE_VBR + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); +#else arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); - arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); +#endif + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); } } @@ -163,6 +174,17 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { #endif } +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth) { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i, bit_depth) >= q_val) break; + + if (i == QINDEX_RANGE) i--; + + return i; +} + int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor, vpx_bit_depth_t bit_depth) { const double q = vp9_convert_qindex_to_q(qindex, bit_depth); @@ -182,24 +204,29 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); return VPXMAX(FRAME_OVERHEAD_BITS, - (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); } int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - target = min_frame_target; + + if (cpi->oxcf.pass != 2) { + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } } + // Clip the frame target to the maximum allowed value. 
if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; @@ -326,7 +353,10 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->af_ratio_onepass_vbr = 10; rc->prev_avg_source_sad_lag = 0; rc->high_source_sad = 0; + rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; + rc->alt_ref_gf_group = 0; + rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; rc->fac_active_worst_gf = 100; rc->force_qpmin = 0; @@ -410,7 +440,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rcf = rc->rate_correction_factors[GF_ARF_STD]; else rcf = rc->rate_correction_factors[INTER_NORMAL]; @@ -436,7 +466,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rc->rate_correction_factors[GF_ARF_STD] = factor; else rc->rate_correction_factors[INTER_NORMAL] = factor; @@ -519,6 +549,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality) { const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int q = active_worst_quality; int last_error = INT_MAX; int i, target_bits_per_mb, bits_per_mb_at_this_q; @@ -533,7 +564,8 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, do { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cpi->svc.temporal_layer_id == 0) { + cr->apply_cyclic_refresh && + (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) { bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { @@ -555,7 +587,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && + if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && cpi->rc.q_1_frame != cpi->rc.q_2_frame) { q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), @@ -640,7 +674,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME) return rc->worst_quality; + if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad) + return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. @@ -650,6 +685,17 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { ? 
VPXMIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; + // For SVC if the current base spatial layer was key frame, use the QP from + // that base layer for ambient_qp. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame) { + const RATE_CONTROL *lrc = &lc->rc; + ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]); + } + } active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. @@ -961,6 +1007,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, qdelta = vp9_compute_qdelta_by_rate( &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); } + if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0; *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; } @@ -1289,6 +1336,28 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { } } +static void update_altref_usage(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + int sum_ref_frame_usage = 0; + int arf_frame_usage = 0; + int mi_row, mi_col; + if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] + + cpi->count_lastgolden_frame_usage[sboffset]; + arf_frame_usage += cpi->count_arf_frame_usage[sboffset]; + } + } + if (sum_ref_frame_usage > 0) { + double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage; + cpi->rc.perc_arf_usage = + 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count; + } +} + static void compute_frame_low_motion(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; int mi_row, mi_col; @@ -1314,10 +1383,6 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { RATE_CONTROL *const rc = &cpi->rc; const int qindex = cm->base_qindex; - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - vp9_cyclic_refresh_postencode(cpi); - } - // Update rate control heuristics rc->projected_frame_size = (int)(bytes_used << 3); @@ -1416,8 +1481,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi); + if (cm->frame_type != KEY_FRAME) { + compute_frame_low_motion(cpi); + if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); + } + cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } + if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; + + rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -1429,24 +1501,16 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.rc_1_frame = 0; } -// Use this macro to turn on/off use of alt-refs in one-pass mode. 
-#define USE_ALTREF_FOR_ONE_PASS 1 - static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; - int target; const int af_ratio = rc->af_ratio_onepass_vbr; -#if USE_ALTREF_FOR_ONE_PASS - target = + int target = (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / (rc->baseline_gf_interval + af_ratio - 1) : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); -#else - target = rc->avg_frame_bandwidth; -#endif return vp9_rc_clamp_pframe_target_size(cpi, target); } @@ -1499,8 +1563,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) { vp9_cyclic_refresh_set_golden_update(cpi); } else { - rc->baseline_gf_interval = - (rc->min_gf_interval + rc->max_gf_interval) / 2; + rc->baseline_gf_interval = VPXMIN( + 20, VPXMAX(10, (rc->min_gf_interval + rc->max_gf_interval) / 2)); } rc->af_ratio_onepass_vbr = 10; if (rc->rolling_target_bits > 0) @@ -1518,14 +1582,20 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, + DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; - rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + } } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -1802,6 +1872,26 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, // Clamp min to max rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); + + if (oxcf->target_level == LEVEL_AUTO) { + const uint32_t pic_size = cpi->common.width * cpi->common.height; + const uint32_t pic_breadth = + VPXMAX(cpi->common.width, cpi->common.height); + int i; + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + if (rc->min_gf_interval <= + (int)vp9_level_defs[i].min_altref_distance) { + rc->min_gf_interval = + (int)vp9_level_defs[i].min_altref_distance + 1; + rc->max_gf_interval = + VPXMAX(rc->max_gf_interval, rc->min_gf_interval); + } + break; + } + } + } } } @@ -1888,9 +1978,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) { else target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) - vbr_rate_correction(cpi, &target_rate); + if (!cpi->oxcf.vbr_corpus_complexity) { + // Correction to rate target based on prior over or under shoot. 
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) + vbr_rate_correction(cpi, &target_rate); + } vp9_rc_set_frame_target(cpi, target_rate); } @@ -2025,7 +2117,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { return resize_action; } -void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; @@ -2036,7 +2129,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { uint64_t avg_source_sad_lag = avg_sad_current; int high_source_sad_lagindex = -1; int steady_sad_lagindex = -1; - uint32_t sad_thresh1 = 60000; + uint32_t sad_thresh1 = 70000; uint32_t sad_thresh2 = 120000; int low_content = 0; int high_content = 0; @@ -2088,8 +2181,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->high_source_sad_lagindex = high_source_sad_lagindex; // Adjust some factors for the next GF group, ignore initial key frame, // and only for lag_in_frames not too small. - if (cpi->refresh_golden_frame == 1 && cm->frame_type != KEY_FRAME && - cm->current_video_frame > 30 && cpi->oxcf.lag_in_frames > 8) { + if (cpi->refresh_golden_frame == 1 && cm->current_video_frame > 30 && + cpi->oxcf.lag_in_frames > 8) { int frame_constraint; if (rc->rolling_target_bits > 0) rate_err = @@ -2110,6 +2203,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { ? VPXMAX(10, rc->baseline_gf_interval >> 1) : VPXMAX(6, rc->baseline_gf_interval >> 1); } + if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames - 1) + rc->baseline_gf_interval = cpi->oxcf.lag_in_frames - 1; // Check for constraining gf_interval for up-coming scene/content changes, // or for up-coming key frame, whichever is closer. frame_constraint = rc->frames_to_key; @@ -2123,9 +2218,14 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // Adjust factors for active_worst setting & af_ratio for next gf interval. rc->fac_active_worst_inter = 150; // corresponds to 3/2 (= 150 /100). rc->fac_active_worst_gf = 100; - if (rate_err < 1.5 && !high_content) { + if (rate_err < 2.0 && !high_content) { rc->fac_active_worst_inter = 120; rc->fac_active_worst_gf = 90; + } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) { + // Increase active_worst faster at low Q if rate fluctuation is high. + rc->fac_active_worst_inter = 200; + if (rc->avg_frame_qindex[INTER_FRAME] < 8) + rc->fac_active_worst_inter = 400; } if (low_content && rc->avg_frame_low_motion > 80) { rc->af_ratio_onepass_vbr = 15; @@ -2133,6 +2233,30 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->af_ratio_onepass_vbr = 5; rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + // Flag to disable usage of ARF based on past usage, only allow this + // disabling if current frame/group does not start with key frame or + // scene cut. Note perc_arf_usage is only computed for speed >= 5. + int arf_usage_low = + (cm->frame_type != KEY_FRAME && !rc->high_source_sad && + cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5); + // Don't use alt-ref for this group under certain conditions. 
+ if (arf_usage_low || + (rc->high_source_sad_lagindex > 0 && + rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || + (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + } else { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + // If alt-ref is used for this gf group, limit the interval. + if (rc->baseline_gf_interval > 12) { + rc->baseline_gf_interval = 12; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + } + } + } target = calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } @@ -2144,9 +2268,12 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // in content and allow rate control to react. // This function also handles special case of lag_in_frames, to measure content // level in #future frames set by the lag_in_frames. -void vp9_avg_source_sad(VP9_COMP *cpi) { +void vp9_scene_detection_onepass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return; +#endif rc->high_source_sad = 0; if (cpi->Last_Source != NULL && cpi->Last_Source->y_width == cpi->Source->y_width && @@ -2159,11 +2286,14 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { int start_frame = 0; int frames_to_buffer = 1; int frame = 0; + int scene_cut_force_key_frame = 0; uint64_t avg_sad_current = 0; uint32_t min_thresh = 4000; float thresh = 8.0f; + uint32_t thresh_key = 140000; + if (cpi->oxcf.speed <= 5) thresh_key = 240000; if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 60000; + min_thresh = 65000; thresh = 2.1f; } if (cpi->oxcf.lag_in_frames > 0) { @@ -2189,6 +2319,8 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad_current > thresh_key) + scene_cut_force_key_frame = 1; // Update recursive average for current frame. if (avg_sad_current > 0) rc->avg_source_sad[0] = @@ -2208,6 +2340,7 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { const BLOCK_SIZE bsize = BLOCK_64X64; // Loop over sub-sample of frame, compute average sad over 64x64 blocks. uint64_t avg_sad = 0; + uint64_t tmp_sad = 0; int num_samples = 0; int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; @@ -2220,13 +2353,14 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. 
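// That is, boundary superblocks are skipped and, of the interior ones, only
// those where sbi_row and sbi_col share the same parity are sampled
// (roughly half of the interior superblocks).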
- if ((sbi_row > 0 && sbi_col > 0) && - (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && - ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || - (sbi_row % 2 != 0 && sbi_col % 2 != 0))) { + if (((sbi_row > 0 && sbi_col > 0) && + (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && + ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || + (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { + tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + avg_sad += tmp_sad; num_samples++; - avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); } src_y += 64; last_src_y += 64; @@ -2247,6 +2381,8 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad > thresh_key) + scene_cut_force_key_frame = 1; if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR) rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2; } else { @@ -2254,6 +2390,23 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { } } } + // For CBR non-screen content mode, check if we should reset the rate + // control. Reset is done if high_source_sad is detected and the rate + // control is at very low QP with rate correction factor at min level. + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality && + rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) && + rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) { + rc->rate_correction_factors[INTER_NORMAL] = 0.5; + rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->reset_high_source_sad = 1; + } + if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) + rc->this_frame_target = rc->avg_frame_bandwidth; + } // For VBR, under scene change/high content change, force golden refresh. if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2261,6 +2414,10 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; + if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME; + rc->source_alt_ref_pending = 0; + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) + rc->source_alt_ref_pending = 1; rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.h b/libs/libvpx/vp9/encoder/vp9_ratectrl.h index 6006e9b051..c1b210677e 100644 --- a/libs/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.h @@ -21,6 +21,9 @@ extern "C" { #endif +// Used to control aggressive VBR mode. +// #define AGGRESSIVE_VBR 1 + // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 @@ -29,6 +32,8 @@ extern "C" { #define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 +#define FRAME_OVERHEAD_BITS 200 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -147,6 +152,8 @@ typedef struct { int rc_2_frame; int q_1_frame; int q_2_frame; + // Keep track of the last target average frame bandwidth. + int last_avg_frame_bandwidth; // Auto frame-scaling variables. 
FRAME_SCALE_LEVEL frame_size_selector; @@ -160,11 +167,15 @@ typedef struct { uint64_t avg_source_sad[MAX_LAG_BUFFERS]; uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; + int alt_ref_gf_group; + int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; + int reset_high_source_sad; + double perc_arf_usage; } RATE_CONTROL; struct VP9_COMP; @@ -178,6 +189,8 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth); + void vp9_rc_init_minq_luts(void); int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); @@ -277,7 +290,7 @@ void vp9_set_target_rate(struct VP9_COMP *cpi); int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); -void vp9_avg_source_sad(struct VP9_COMP *cpi); +void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); diff --git a/libs/libvpx/vp9/encoder/vp9_rd.c b/libs/libvpx/vp9/encoder/vp9_rd.c index 76c639a647..6b2306ce9b 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.c +++ b/libs/libvpx/vp9/encoder/vp9_rd.c @@ -146,7 +146,7 @@ static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, 128, 144 }; -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { +int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); #if CONFIG_VP9_HIGHBITDEPTH int64_t rdmult = 0; @@ -161,6 +161,12 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { #else int64_t rdmult = 88 * q * q / 24; #endif // CONFIG_VP9_HIGHBITDEPTH + return rdmult; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int64_t rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; @@ -306,63 +312,62 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { } } +// NOTE: The tables below must be of the same size. + +// The functions described below are sampled at the four most significant +// bits of x^2 + 8 / 256. + +// Normalized rate: +// This table models the rate for a Laplacian source with given variance +// when quantized with a uniform quantizer with given stepsize. The +// closed form expression is: +// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], +// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), +// and H(x) is the binary entropy function. 
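As a rough illustration of the closed-form model quoted above (a sketch for this note only, not code from libvpx), the expression can be evaluated directly in floating point; called with x = sqrt(xsq_iq_q10[i] / 1024.0), it should reproduce rate_tab_q10[i] to within rounding (entry 0 is a saturated special case, since r approaches 1 there):

    #include <math.h>

    /* Binary entropy H(p) in bits, for p in (0, 1). */
    static double binary_entropy(double p) {
      return -p * log2(p) - (1.0 - p) * log2(1.0 - p);
    }

    /* Normalized rate Rn(x) in Q10 for x = qpstep / sqrt(variance), following
       Rn(x) = H(sqrt(r)) + sqrt(r) * (1 + H(r) / (1 - r)),
       with r = exp(-sqrt(2) * x). */
    static int rate_q10_from_x(double x) {
      const double r = exp(-sqrt(2.0) * x);
      const double rn = binary_entropy(sqrt(r)) +
                        sqrt(r) * (1.0 + binary_entropy(r) / (1.0 - r));
      return (int)(rn * 1024.0 + 0.5);
    }

For example, x = sqrt(4.0 / 1024.0) = 0.0625 yields about 6086, matching rate_tab_q10[1].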
+static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044, + 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037, + 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179, + 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398, + 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, 911, 864, 821, 781, 745, + 680, 623, 574, 530, 490, 455, 424, 395, 345, 304, 269, 239, 213, + 190, 171, 154, 126, 104, 87, 73, 61, 52, 44, 38, 28, 21, + 16, 12, 10, 8, 6, 5, 3, 2, 1, 1, 1, 0, 0, +}; + +// Normalized distortion: +// This table models the normalized distortion for a Laplacian source +// with given variance when quantized with a uniform quantizer +// with given stepsize. The closed form expression is: +// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) +// where x = qpstep / sqrt(variance). +// Note the actual distortion is Dn * variance. +static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, + 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 21, + 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, 59, 64, 69, + 73, 78, 88, 97, 106, 115, 124, 133, 142, 151, 167, 184, 200, + 215, 231, 245, 260, 274, 301, 327, 351, 375, 397, 418, 439, 458, + 495, 528, 559, 587, 613, 637, 659, 680, 717, 749, 777, 801, 823, + 842, 859, 874, 899, 919, 936, 949, 960, 969, 977, 983, 994, 1001, + 1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, +}; +static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, +}; + static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { - // NOTE: The tables below must be of the same size. - - // The functions described below are sampled at the four most significant - // bits of x^2 + 8 / 256. - - // Normalized rate: - // This table models the rate for a Laplacian source with given variance - // when quantized with a uniform quantizer with given stepsize. The - // closed form expression is: - // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], - // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), - // and H(x) is the binary entropy function. 
- static const int rate_tab_q10[] = { - 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, - 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, - 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, - 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, - 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, - 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, - 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, - 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, - 5, 3, 2, 1, 1, 1, 0, 0, - }; - - // Normalized distortion: - // This table models the normalized distortion for a Laplacian source - // with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expression is: - // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) - // where x = qpstep / sqrt(variance). - // Note the actual distortion is Dn * variance. - static const int dist_tab_q10[] = { - 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, - 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, - 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, - 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, - 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, - 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, - 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, - 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, - 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, - }; - static const int xsq_iq_q10[] = { - 0, 4, 8, 12, 16, 20, 24, 28, 32, - 40, 48, 56, 64, 72, 80, 88, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 256, 288, - 320, 352, 384, 416, 448, 480, 544, 608, 672, - 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, - 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, - 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, - 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, - 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, - 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, - 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, - 180192, 196576, 212960, 229344, 245728, - }; const int tmp = (xsq_q10 >> 2) + 8; const int k = get_msb(tmp) - 3; const int xq = (k << 3) + ((tmp >> k) & 0x7); @@ -373,6 +378,24 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } +static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], + int r_q10[MAX_MB_PLANE], + int d_q10[MAX_MB_PLANE]) { + int i; + const int one_q10 = 1 << 10; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int tmp = (xsq_q10[i] >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; + } +} + +static const uint32_t MAX_XSQ_Q10 = 245727; + void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist) { @@ -387,7 +410,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, *dist = 0; } else { int d_q10, r_q10; - static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); @@ -397,6 
+419,30 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } +// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where +// vectors are of length MAX_MB_PLANE and all elements of var are non-zero. +void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], + unsigned int n_log2[MAX_MB_PLANE], + unsigned int qstep[MAX_MB_PLANE], + int64_t *rate_sum, int64_t *dist_sum) { + int i; + int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const uint64_t xsq_q10_64 = + (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / + var[i]; + xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); + } + model_rd_norm_vec(xsq_q10, r_q10, d_q10); + for (i = 0; i < MAX_MB_PLANE; ++i) { + int rate = + ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); + int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; + *rate_sum += rate; + *dist_sum += dist; + } +} + void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[16], @@ -624,19 +670,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth) { - const int q = vp9_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - switch (bit_depth) { - case VPX_BITS_8: return 20 * q; - case VPX_BITS_10: return 5 * q; - case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; - } -#else - return 20 * q; -#endif // CONFIG_VP9_HIGHBITDEPTH +int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + int qindex, int qdelta) { + // Reduce the intra cost penalty for small blocks (<=16x16). + int reduction_fac = + (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) + // Don't reduce intra cost penalty if estimated noise level is high. + reduction_fac = 0; + + // Always use VPX_BITS_8 as input here because the penalty is applied + // to rate not distortion so we want a consistent penalty for all bit + // depths. If the actual bit depth were passed in here then the value + // retured by vp9_dc_quant() would scale with the bit depth and we would + // then need to apply inverse scaling to correct back to a bit depth + // independent rate penalty. 
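// For example, with a hypothetical vp9_dc_quant(qindex, qdelta, VPX_BITS_8)
// value of 40, the base penalty is 20 * 40 = 800; a BLOCK_16X16 then gets
// 800 >> 2 = 200 and a BLOCK_8X8 gets 800 >> 4 = 50.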
+ return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac; } diff --git a/libs/libvpx/vp9/encoder/vp9_rd.h b/libs/libvpx/vp9/encoder/vp9_rd.h index 05344b6cf0..59022c106e 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.h +++ b/libs/libvpx/vp9/encoder/vp9_rd.h @@ -38,6 +38,7 @@ extern "C" { #define MAX_MODES 30 #define MAX_REFS 6 +#define RD_THRESH_INIT_FACT 32 #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 @@ -128,6 +129,9 @@ struct TileDataEnc; struct VP9_COMP; struct macroblock; +int64_t vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, + int qindex); + int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); void vp9_initialize_rd_consts(struct VP9_COMP *cpi); @@ -137,6 +141,11 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, unsigned int qstep, int *rate, int64_t *dist); +void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], + unsigned int n_log2[MAX_MB_PLANE], + unsigned int qstep[MAX_MB_PLANE], + int64_t *rate_sum, int64_t *dist_sum); + int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); @@ -164,8 +173,8 @@ void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, - int thresh_fact) { - return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; + const int *const thresh_fact) { + return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; } static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { @@ -182,13 +191,18 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, const struct scale_factors *scale, const struct scale_factors *scale_uv); -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth); +int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi, + BLOCK_SIZE bsize, int qindex, int qdelta); +unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs); unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); #if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs, + int bd); unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.c b/libs/libvpx/vp9/encoder/vp9_rdopt.c index 27d4e9d6d3..2ba6378c5e 100644 --- a/libs/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libs/libvpx/vp9/encoder/vp9_rdopt.c @@ -164,17 +164,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const int ref = xd->mi[0]->ref_frame[0]; unsigned int sse; unsigned int var = 0; - unsigned int sum_sse = 0; int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int rate; int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; + unsigned int qstep_vec[MAX_MB_PLANE]; + unsigned int nlog2_vec[MAX_MB_PLANE]; + unsigned int sum_sse_vec[MAX_MB_PLANE]; + int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; @@ -186,6 +188,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size]; const int64_t dc_thr = p->quant_thred[0] >> shift; const int64_t ac_thr = p->quant_thred[1] >> shift; + unsigned int sum_sse = 0; // The low thresholds are used to measure if the prediction errors are // low enough so that we can skip the mode search. const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2); @@ -196,8 +199,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; - sum_sse = 0; - for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -233,12 +234,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; + sum_sse_vec[i] = sum_sse; + any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); + qstep_vec[i] = pd->dequant[1] >> dequant_shift; + nlog2_vec[i] = num_pels_log2_lookup[bs]; + } - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { + for (i = 0; i < MAX_MB_PLANE; ++i) { int64_t rate; - const int64_t square_error = sum_sse; - int quantizer = (pd->dequant[1] >> dequant_shift); + const int64_t square_error = sum_sse_vec[i]; + int quantizer = qstep_vec[i]; if (quantizer < 120) rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); @@ -247,12 +254,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, dist = (square_error * quantizer) >> 8; rate_sum += rate; dist_sum += dist; + } + } else { + if (any_zero_sum_sse) { + for (i = 0; i < MAX_MB_PLANE; ++i) { + int rate; + vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], + &rate, &dist); + rate_sum += rate; + dist_sum += dist; + } } else { - vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs], - pd->dequant[1] >> dequant_shift, &rate, - &dist); - rate_sum += rate; - dist_sum += dist; + vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, + &rate_sum, &dist_sum); } } @@ -284,22 +298,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, return error; } -int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - // Note that the C versions of these 2 functions (vp9_block_error and - // vp9_highbd_block_error_8bit are the same, but the optimized assembly - // routines are not compatible in the non high bitdepth configuration, so - // they still cannot share the same name. 
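The comment being removed above notes that the 8-bit and high-bit-depth C reference paths compute the same quantity, which is why the dispatch below can now call vp9_block_error directly when bd == 8. For reference, that quantity is the summed squared difference between the transform coefficients and their dequantized values, with *ssz returning the coefficient energy. A minimal sketch under that reading (the typedef and helper name are stand-ins, not the library's):

    #include <stdint.h>

    typedef int32_t tran_low_t; /* stand-in; the real width depends on the build */

    /* error = sum((coeff - dqcoeff)^2), *ssz = sum(coeff^2) */
    static int64_t block_error_sketch(const tran_low_t *coeff,
                                      const tran_low_t *dqcoeff,
                                      intptr_t block_size, int64_t *ssz) {
      int64_t error = 0, sqcoeff = 0;
      intptr_t i;
      for (i = 0; i < block_size; ++i) {
        const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i] * coeff[i];
      }
      *ssz = sqcoeff;
      return error;
    }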
- return vp9_block_error_c(coeff, dqcoeff, block_size, ssz); -} - static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { if (bd == 8) { - return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz); + return vp9_block_error(coeff, dqcoeff, block_size, ssz); } else { return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd); } @@ -321,7 +325,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, return error; } -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size) { int i; int64_t error = 0; @@ -358,11 +362,11 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = x->token_costs[tx_size][type][is_inter_block(mi)]; uint8_t token_cache[32 * 32]; - int c, cost; + int cost; #if CONFIG_VP9_HIGHBITDEPTH - const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else - const int *cat6_high_cost = vp9_get_high_cost_table(8); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif // Check for consistency of tx_size with mode info @@ -373,10 +377,10 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, if (eob == 0) { // single eob token cost = token_costs[0][0][pt][EOB_TOKEN]; - c = 0; } else { if (use_fast_coef_costing) { int band_left = *band_count++; + int c; // dc token int v = qcoeff[0]; @@ -407,6 +411,7 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, } else { // !use_fast_coef_costing int band_left = *band_count++; + int c; // dc token int v = qcoeff[0]; @@ -510,7 +515,8 @@ static int64_t sum_squares_visible(const MACROBLOCKD *xd, pd->subsampling_y, blk_row); if (tx_bsize == BLOCK_4X4 || (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_bsize); + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); } else { int r, c; int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); @@ -520,7 +526,8 @@ static int64_t sum_squares_visible(const MACROBLOCKD *xd, for (r = 0; r < max_r; ++r) { // Skip visiting the sub blocks that are wholly within the UMV. 
for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, BLOCK_4X4); + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); } } } @@ -592,31 +599,31 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon = CONVERT_TO_BYTEPTR(recon); - vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, - bs, bs, xd->bd); + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } } + recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; @@ -625,7 +632,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->itxm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, *eob); break; default: assert(0 && "Invalid transform size"); break; } @@ -723,7 +730,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } } else { // SKIP_TXFM_AC_DC - // skip forward transform + // skip forward transform. Because this is handled here, the quantization + // does not need to do it. 
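In the sum_squares_visible hunk above, the corrected calls pass an explicit pixel dimension (tx_4x4_w << 2, or 4 inside the loop) and offset diff to each 4x4 sub-block, whereas the removed inner-loop call read from the unoffset block origin on every iteration. A self-contained reference of the per-sub-block accumulation, assuming that reading (sum_squares_4x4 below is a plain stand-in for vpx_sum_squares_2d_i16, not the dsp routine itself):

    #include <stdint.h>

    /* Reference 4x4 sum of squared residuals (stand-in for the dsp routine). */
    static int64_t sum_squares_4x4(const int16_t *diff, int stride) {
      int64_t ss = 0;
      int r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          ss += (int64_t)diff[r * stride + c] * diff[r * stride + c];
      return ss;
    }

    /* Accumulate SSE over only the visible 4x4 sub-blocks, stepping into
     * each sub-block as the corrected loop above does. */
    static int64_t visible_sse(const int16_t *diff, int stride,
                               int max_r, int max_c) {
      int64_t sse = 0;
      int r, c;
      for (r = 0; r < max_r; ++r)
        for (c = 0; c < max_c; ++c)
          sse += sum_squares_4x4(diff + r * stride * 4 + c * 4, stride);
      return sse;
    }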
x->plane[plane].eobs[block] = 0; sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; @@ -751,9 +759,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, // TODO(jingning): temporarily enabled only for luma component rd = VPXMIN(rd1, rd2); - if (plane == 0) + if (plane == 0) { x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless); + x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; + } args->this_rate += rate; args->this_dist += dist; @@ -995,6 +1005,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int block = (row + idy) * 2 + (col + idx); const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); @@ -1016,7 +1027,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; - vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, + vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16, dst_stride, p->eobs[block], xd->bd); } else { int64_t unused; @@ -1039,7 +1050,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), - dst, dst_stride, p->eobs[block], xd->bd); + dst16, dst_stride, p->eobs[block], xd->bd); } } } @@ -1129,16 +1140,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 
1 : 0; -#if CONFIG_VP9_HIGHBITDEPTH - distortion += - vp9_highbd_block_error_8bit( - coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> - 2; -#else distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2; -#endif if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst, @@ -1526,7 +1530,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst), + pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), xd->bd); @@ -1572,8 +1577,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); - x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), - coeff, 8); + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH thisdistortion += vp9_highbd_block_error_dispatch( @@ -1781,9 +1786,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); vp9_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); + CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride, + second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, + kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); } else { second_pred = (uint8_t *)second_pred_alloc_16; vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, @@ -1998,7 +2003,8 @@ static int64_t rd_pick_best_sub8x8_mode( vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); bestsme = vp9_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + sadpb, sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1); @@ -2403,9 +2409,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &ref_mv, - &tmp_mv->as_mv, INT_MAX, 1); + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_limits = tmp_mv_limits; @@ -2870,57 +2876,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. 
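Reading the rewritten adjustment in the hunk that follows: the bias only applies when the smaller of the source and reconstruction variances is at or below LOW_VAR_THRESH, and it then inflates the RD cost by var_factor percent, where var_factor = min(adj_max, VAR_MULT * |src - rec| / max(1, src)) and adj_max is taken from max_var_adjust[] for the configured content type. A minimal standalone sketch of that arithmetic (names are local to the example; the extra film-content doubling is omitted):

    #include <stdint.h>

    static int64_t adjust_rd_sketch(int64_t rd, unsigned int src_var,
                                    unsigned int rec_var, unsigned int adj_max) {
      const unsigned int low_var_thresh = 5; /* LOW_VAR_THRESH in the hunk below */
      const unsigned int var_mult = 100;     /* VAR_MULT in the hunk below */
      const unsigned int min_var = src_var < rec_var ? src_var : rec_var;
      unsigned int diff, var_factor;
      if (min_var > low_var_thresh) return rd; /* bias not applied */
      diff = src_var > rec_var ? src_var - rec_var : rec_var - src_var;
      var_factor = (unsigned int)(((int64_t)var_mult * diff) /
                                  (src_var > 0 ? src_var : 1));
      if (var_factor > adj_max) var_factor = adj_max; /* cap per content type */
      return rd + (rd * var_factor) / 100; /* inflate rd by var_factor percent */
    }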
-#define LOW_VAR_THRESH 16 -#define VLOW_ADJ_MAX 25 -#define VHIGH_ADJ_MAX 8 +#define VERY_LOW_VAR_THRESH 2 +#define LOW_VAR_THRESH 5 +#define VAR_MULT 100 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 }; + static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, MV_REFERENCE_FRAME ref_frame, unsigned int source_variance) { MACROBLOCKD *const xd = &x->e_mbd; - unsigned int recon_variance; + unsigned int rec_variance; + unsigned int src_variance; + unsigned int src_rec_min; unsigned int absvar_diff = 0; - int64_t var_error = 0; - int64_t var_factor = 0; + unsigned int var_factor = 0; + unsigned int adj_max; + vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, + if (source_variance > 0) { + rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = source_variance; + } else { + rec_variance = + vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); + } } else { - recon_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = + vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } } #else - recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } #endif // CONFIG_VP9_HIGHBITDEPTH - if ((source_variance + recon_variance) > LOW_VAR_THRESH) { - absvar_diff = (source_variance > recon_variance) - ? (source_variance - recon_variance) - : (recon_variance - source_variance); + // Lower of source (raw per pixel value) and recon variance. Note that + // if the source per pixel is 0 then the recon value here will not be per + // pixel (see above) so will likely be much larger. + src_rec_min = VPXMIN(source_variance, rec_variance); - var_error = ((int64_t)200 * source_variance * recon_variance) / - (((int64_t)source_variance * source_variance) + - ((int64_t)recon_variance * recon_variance)); - var_error = 100 - var_error; - } + if (src_rec_min > LOW_VAR_THRESH) return; + + absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) + : (rec_variance - src_variance); + + adj_max = max_var_adjust[content_type]; + + var_factor = + (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance); + var_factor = VPXMIN(adj_max, var_factor); - // Source variance above a threshold and ref frame is intra. - // This case is targeted mainly at discouraging intra modes that give rise - // to a predictor with a low spatial complexity compared to the source. 
- if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) && - (source_variance > recon_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error)); - // A second possible case of interest is where the source variance - // is very low and we wish to discourage false texture or motion trails. - } else if ((source_variance < (LOW_VAR_THRESH >> 1)) && - (recon_variance > source_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error)); - } *this_rd += (*this_rd * var_factor) / 100; + + if (content_type == VP9E_CONTENT_FILM) { + if (src_rec_min <= VERY_LOW_VAR_THRESH) { + if (ref_frame == INTRA_FRAME) *this_rd *= 2; + if (bsize > 6) *this_rd *= 2; + } + } } // Do we have an internal image edge (e.g. formatting bars). @@ -3032,8 +3063,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; PREDICTION_MODE mode_uv[TX_SIZES]; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; uint8_t ref_frame_skip_mask[2] = { 0 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; @@ -3041,7 +3072,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; + int *tile_mode_map = tile_data->mode_map[bsize]; + int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; @@ -3153,23 +3187,28 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; midx = sf->schedule_mode_search ? mode_skip_start : 0; + while (midx > 4) { uint8_t end_pos = 0; for (i = 5; i < midx; ++i) { - if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) { - uint8_t tmp = mode_map[i]; - mode_map[i] = mode_map[i - 1]; - mode_map[i - 1] = tmp; + if (mode_threshold[tile_mode_map[i - 1]] > + mode_threshold[tile_mode_map[i]]) { + uint8_t tmp = tile_mode_map[i]; + tile_mode_map[i] = tile_mode_map[i - 1]; + tile_mode_map[i - 1] = tmp; end_pos = i; } } midx = end_pos; } + memcpy(mode_map, tile_mode_map, sizeof(mode_map)); + for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int mode_excluded = 0; @@ -3187,6 +3226,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame = vp9_mode_order[mode_index].ref_frame[0]; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + vp9_zero(x->sum_y_eobs); + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. if (midx == mode_skip_start && best_mode_index >= 0) { @@ -3218,6 +3259,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (best_rd < mode_threshold[mode_index]) continue; + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + if (sf->motion_field_mode_search) { const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize], tile_info->mi_col_end - mi_col); @@ -3466,6 +3510,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size], sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history @@ -3571,6 +3616,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. + if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3689,6 +3737,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mv[0].as_int = 0; x->skip = 1; + ctx->sum_y_eobs = 0; + if (cm->interp_filter != BILINEAR) { best_filter = EIGHTTAP; if (cm->interp_filter == SWITCHABLE && @@ -3777,8 +3827,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv; int skip_uv; PREDICTION_MODE mode_uv = DC_PRED; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; @@ -3787,6 +3837,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int internal_active_edge = vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi); + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3838,6 +3889,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame = vp9_ref_order[ref_index].ref_frame[0]; second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; + vp9_zero(x->sum_y_eobs); + #if CONFIG_BETTER_HW_COMPATIBILITY // forbid 8X4 and 4X8 partitions if any reference frame is scaled. if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { @@ -3878,9 +3931,12 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!internal_active_edge && rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], - tile_data->thresh_freq_fact[bsize][ref_index])) + &rd_thresh_freq_fact[ref_index])) continue; + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -4051,6 +4107,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (i = 0; i < 4; i++) { tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; } pred_exists = 1; if (switchable_filter_index == 0 && sf->use_rd_breakout && @@ -4215,6 +4272,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4]; for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i]; diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.c b/libs/libvpx/vp9/encoder/vp9_skin_detection.c index 3f3d48fb9f..cc6c967767 100644 --- a/libs/libvpx/vp9/encoder/vp9_skin_detection.c +++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.c @@ -15,75 +15,6 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_skin_detection.h" -#define MODEL_MODE 1 - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Thresholds on luminance. -static const int y_low = 40; -static const int y_high = 220; - -// Evaluates the Mahalanobis distance measure for the input CbCr values. -static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion) { - if (y < y_low || y > y_high) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; i++) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) - return 0; - else if (motion == 0 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) - return 0; - else - return 1; - } - // Exit if difference is much large than the threshold. 
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn) { @@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; - return vp9_skin_pixel(ysource, usource, vsource, motion); + return vpx_skin_pixel(ysource, usource, vsource, motion); } } -#ifdef OUTPUT_YUV_SKINMAP -// For viewing skin map on input source. -void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { - int i, j, mi_row, mi_col, num_bl; +void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + int i, j, num_bl; VP9_COMMON *const cm = &cpi->common; - uint8_t *y; const uint8_t *src_y = cpi->Source->y_buffer; const uint8_t *src_u = cpi->Source->u_buffer; const uint8_t *src_v = cpi->Source->v_buffer; const int src_ystride = cpi->Source->y_stride; const int src_uvstride = cpi->Source->uv_stride; - int y_bsize = 16; // Use 8x8 or 16x16. - int uv_bsize = y_bsize >> 1; - int ypos = y_bsize >> 1; - int uvpos = uv_bsize >> 1; - int shy = (y_bsize == 8) ? 3 : 4; - int shuv = shy - 1; - int fac = y_bsize / 8; - // Use center pixel or average of center 2x2 pixels. - int mode_filter = 0; + const int y_bsize = 4 << b_width_log2_lookup[bsize]; + const int uv_bsize = y_bsize >> 1; + const int shy = (y_bsize == 8) ? 3 : 4; + const int shuv = shy - 1; + const int fac = y_bsize / 8; + const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3); + const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2); + const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2); + const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2); + src_y += y_shift; + src_u += uv_shift; + src_v += uv_shift; + + for (i = mi_row; i < mi_row_limit; i += fac) { + num_bl = 0; + for (j = mi_col; j < mi_col_limit; j += fac) { + int consec_zeromv = 0; + int bl_index = i * cm->mi_cols + j; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + // Don't detect skin on the boundary. + if (i == 0 || j == 0) continue; + if (bsize == BLOCK_8X8) + consec_zeromv = cpi->consec_zero_mv[bl_index]; + else + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + cpi->skin_map[bl_index] = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, consec_zeromv, 0); + num_bl++; + src_y += y_bsize; + src_u += uv_bsize; + src_v += uv_bsize; + } + src_y += (src_ystride << shy) - (num_bl << shy); + src_u += (src_uvstride << shuv) - (num_bl << shuv); + src_v += (src_uvstride << shuv) - (num_bl << shuv); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). + // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin + // blocks. Skip superblock borders to remove isolated non-skin blocks. 
+ for (i = mi_row; i < mi_row_limit; i += fac) { + for (j = mi_col; j < mi_col_limit; j += fac) { + int bl_index = i * cm->mi_cols + j; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + // Skip 4 corners. + if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) || + (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac))) + continue; + // There are only 5 neighbors for non-skin blocks on the border. + if (i == mi_row || i == mi_row_limit - fac || j == mi_col || + j == mi_col_limit - fac) + non_skin_threshold = 5; + + for (mi = -fac; mi <= fac; mi += fac) { + for (mj = -fac; mj <= fac; mj += fac) { + if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col && + j + mj < mi_col_limit) { + int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mi_row, mi_col, num_bl; + VP9_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + const int y_bsize = 16; // Use 8x8 or 16x16. + const int shy = (y_bsize == 8) ? 3 : 4; + const int fac = y_bsize / 8; + YV12_BUFFER_CONFIG skinmap; memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG)); if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x, @@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) { num_bl = 0; for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) { - int is_skin = 0; - if (mode_filter == 1) { - // Use 2x2 average at center. 
- uint8_t ysource = src_y[ypos * src_ystride + ypos]; - uint8_t usource = src_u[uvpos * src_uvstride + uvpos]; - uint8_t vsource = src_v[uvpos * src_uvstride + uvpos]; - uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos]; - uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)]; - uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)]; - uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2; - usource = (usource + usource2 + usource3 + usource4) >> 2; - vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2; - is_skin = vp9_skin_pixel(ysource, usource, vsource, 1); - } else { - int block_size = BLOCK_8X8; - int consec_zeromv = 0; - int bl_index = mi_row * cm->mi_cols + mi_col; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - if (y_bsize == 8) - consec_zeromv = cpi->consec_zero_mv[bl_index]; - else - consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - if (y_bsize == 16) block_size = BLOCK_16X16; - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, block_size, consec_zeromv, 0); - } + const int block_index = mi_row * cm->mi_cols + mi_col; + const int is_skin = cpi->skin_map[block_index]; for (i = 0; i < y_bsize; i++) { for (j = 0; j < y_bsize; j++) { - if (is_skin) - y[i * src_ystride + j] = 255; - else - y[i * src_ystride + j] = src_y[i * src_ystride + j]; + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; } } num_bl++; y += y_bsize; src_y += y_bsize; - src_u += uv_bsize; - src_v += uv_bsize; } y += (src_ystride << shy) - (num_bl << shy); src_y += (src_ystride << shy) - (num_bl << shy); - src_u += (src_uvstride << shuv) - (num_bl << shuv); - src_v += (src_uvstride << shuv) - (num_bl << shuv); } - vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file); + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); vpx_free_frame_buffer(&skinmap); } #endif diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.h b/libs/libvpx/vp9/encoder/vp9_skin_detection.h index c77382dbd7..8880bff466 100644 --- a/libs/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.h @@ -12,6 +12,8 @@ #define VP9_ENCODER_VP9_SKIN_MAP_H_ #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" #ifdef __cplusplus extern "C" { @@ -19,19 +21,16 @@ extern "C" { struct VP9_COMP; -// #define OUTPUT_YUV_SKINMAP - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion); - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn); +void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. 
-void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); -extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f); +void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); #endif #ifdef __cplusplus diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.c b/libs/libvpx/vp9/encoder/vp9_speed_features.c index c1cf48fa7b..a05db60c65 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.c @@ -20,19 +20,14 @@ static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }; -#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +// Define 3 mesh density levels to control the number of searches. +#define MESH_DENSITY_LEVELS 3 static MESH_PATTERN - good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { - { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = { { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; -static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { - 50, 25, 15, 5, 1, 1 -}; // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality @@ -67,14 +62,26 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, int speed) { VP9_COMMON *const cm = &cpi->common; + // speed 0 features + sf->partition_search_breakout_thr.dist = (1 << 20); + sf->partition_search_breakout_thr.rate = 80; + + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. + if (VPXMIN(cm->width, cm->height) >= 480) { + sf->ml_partition_search_early_termination = 1; + } + if (speed >= 1) { + sf->ml_partition_search_early_termination = 0; + if (VPXMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 23); } else { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 21); + sf->partition_search_breakout_thr.dist = (1 << 21); } } @@ -83,12 +90,12 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; - sf->partition_search_breakout_dist_thr = (1 << 24); - sf->partition_search_breakout_rate_thr = 120; + sf->partition_search_breakout_thr.dist = (1 << 24); + sf->partition_search_breakout_thr.rate = 120; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; - sf->partition_search_breakout_dist_thr = (1 << 22); - sf->partition_search_breakout_rate_thr = 100; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->partition_search_breakout_thr.rate = 100; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); @@ -108,14 +115,14 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, if (VPXMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 
1 : 0; - sf->partition_search_breakout_dist_thr = (1 << 25); - sf->partition_search_breakout_rate_thr = 200; + sf->partition_search_breakout_thr.dist = (1 << 25); + sf->partition_search_breakout_thr.rate = 200; } else { sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; - sf->partition_search_breakout_dist_thr = (1 << 23); - sf->partition_search_breakout_rate_thr = 120; + sf->partition_search_breakout_thr.dist = (1 << 23); + sf->partition_search_breakout_thr.rate = 120; } } @@ -129,33 +136,54 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 4) { + sf->partition_search_breakout_thr.rate = 300; if (VPXMIN(cm->width, cm->height) >= 720) { - sf->partition_search_breakout_dist_thr = (1 << 26); + sf->partition_search_breakout_thr.dist = (1 << 26); } else { - sf->partition_search_breakout_dist_thr = (1 << 24); + sf->partition_search_breakout_thr.dist = (1 << 24); } sf->disable_split_mask = DISABLE_ALL_SPLIT; } + + if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 500; + } } static double tx_dom_thresholds[6] = { 99.0, 14.0, 12.0, 8.0, 4.0, 0.0 }; static double qopt_thresholds[6] = { 99.0, 12.0, 10.0, 4.0, 2.0, 0.0 }; -static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, - SPEED_FEATURES *sf, int speed) { +static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, + VP9_COMMON *cm, + SPEED_FEATURES *sf, + int speed) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const int boosted = frame_is_boosted(cpi); + int i; - sf->partition_search_breakout_dist_thr = (1 << 20); - sf->partition_search_breakout_rate_thr = 80; sf->tx_size_search_breakout = 1; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->use_square_partition_only = !frame_is_boosted(cpi); sf->use_square_only_threshold = BLOCK_16X16; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + sf->exhaustive_searches_thresh = (1 << 22); + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } else { + sf->exhaustive_searches_thresh = INT_MAX; + } + if (speed >= 1) { - if (cpi->oxcf.pass == 2) { + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { @@ -182,6 +210,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->mv.subpel_iters_per_step = 1; sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; + sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -190,27 +219,44 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 30; + + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) + : INT_MAX; } if (speed >= 2) { - sf->recode_loop = ALLOW_RECODE_KFARFGF; + if (oxcf->vbr_corpus_complexity) + sf->recode_loop = ALLOW_RECODE_FIRST; + else + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; // Reference masking is not supported in dynamic scaling mode. 
- sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0; + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0; sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 1; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 3) { @@ -229,6 +275,17 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; + sf->allow_partition_search_skip = 1; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 2; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 4) { @@ -244,7 +301,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->motion_field_mode_search = !boosted; - sf->partition_search_breakout_rate_thr = 300; } if (speed >= 5) { @@ -256,7 +312,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_y_mode_mask[i] = INTRA_DC; sf->intra_uv_mode_mask[i] = INTRA_DC; } - sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; sf->simple_model_rd_from_var = 1; } @@ -286,10 +341,11 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 200; if (VPXMIN(cm->width, cm->height) >= 720) { - sf->partition_search_breakout_dist_thr = (1 << 25); + sf->partition_search_breakout_thr.dist = (1 << 25); } else { - sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 23); } } @@ -299,16 +355,25 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, } } -static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, - vp9e_tune_content content) { +static void set_rt_speed_feature_framesize_independent( + VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; const int is_keyframe = cm->frame_type == KEY_FRAME; const int frames_since_key = is_keyframe ? 
0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->use_fast_coef_costing = 1; - sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; + sf->allow_acl = 0; + sf->copy_partition_flag = 0; + sf->use_source_sad = 0; + sf->use_simple_block_yrd = 0; + sf->adapt_partition_source_sad = 0; + sf->use_altref_onepass = 0; + sf->use_compound_nonrd_pickmode = 0; + sf->nonrd_keyframe = 0; + sf->svc_use_lowres_part = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -333,10 +398,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 2) { sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->adaptive_pred_interp_filter = 2; // Reference masking only enabled for 1 spatial layer, and if none of the @@ -386,6 +451,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 4) { int i; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) + sf->use_altref_onepass = 1; sf->last_partitioning_redo_frequency = 4; sf->adaptive_rd_thresh = 5; sf->use_fast_coef_costing = 0; @@ -411,6 +478,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, } if (speed >= 5) { + sf->use_altref_onepass = 0; sf->use_quant_fp = !is_keyframe; sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; @@ -421,6 +489,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = is_keyframe ? 20 : 15; sf->partition_search_type = REFERENCE_PARTITION; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cpi->rc.is_src_frame_alt_ref) { + sf->partition_search_type = VAR_BASED_PARTITION; + } sf->use_nonrd_pick_mode = 1; sf->allow_skip_recode = 0; sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; @@ -430,7 +502,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->adaptive_rd_thresh = 2; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; - sf->partition_search_breakout_rate_thr = 200; sf->coeff_prob_appx_step = 4; sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; @@ -456,42 +527,103 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->short_circuit_flat_blocks = 1; } if (cpi->oxcf.rc_mode == VPX_CBR && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { sf->limit_newmv_early_exit = 1; - sf->bias_golden = 1; + if (!cpi->use_svc) sf->bias_golden = 1; } } if (speed >= 6) { + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } sf->partition_search_type = VAR_BASED_PARTITION; // Turn on this to use non-RD key frame coding mode. 
sf->use_nonrd_pick_mode = 1; sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; - if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && - content != VP9E_CONTENT_SCREEN) { + + if (!cpi->external_resize) sf->use_source_sad = 1; + + if (sf->use_source_sad) { + sf->adapt_partition_source_sad = 1; + sf->adapt_partition_thresh = + (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; + if (cpi->content_state_sb_fd == NULL && + (!cpi->use_svc || + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); + } + } + if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // Enable short circuit for low temporal variance. sf->short_circuit_low_temp_var = 1; } - if (cpi->use_svc) sf->base_mv_aggressive = 1; + if (cpi->svc.temporal_layer_id > 0) { + sf->adaptive_rd_thresh = 4; + sf->limit_newmv_early_exit = 0; + sf->base_mv_aggressive = 1; + } } if (speed >= 7) { + sf->adapt_partition_source_sad = 0; sf->adaptive_rd_thresh = 3; sf->mv.search_method = FAST_DIAMOND; sf->mv.fullpel_search_step_param = 10; + // For SVC: use better mv search on base temporal layer, and only + // on base spatial layer if highest resolution is above 640x360. if (cpi->svc.number_temporal_layers > 2 && - cpi->svc.temporal_layer_id == 0) { + cpi->svc.temporal_layer_id == 0 && + (cpi->svc.spatial_layer_id == 0 || + cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { sf->mv.search_method = NSTEP; sf->mv.fullpel_search_step_param = 6; } + if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) { + sf->use_simple_block_yrd = 1; + if (cpi->svc.non_reference_frame) + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; + } + if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + // Enable partition copy. For SVC only enabled for top spatial resolution + // layer. + cpi->max_copied_frame = 0; + if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && + !cpi->external_resize && + (!cpi->use_svc || + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + sf->copy_partition_flag = 1; + cpi->max_copied_frame = 2; + // The top temporal enhancement layer (for number of temporal layers > 1) + // are non-reference frames, so use large/max value for max_copied_frame. + if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + cpi->max_copied_frame = 255; + } + // For SVC: enable use of lower resolution partition for higher resolution, + // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for non-base temporal layer frames. + if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && + cpi->svc.temporal_layer_id > 0 && + cpi->oxcf.width * cpi->oxcf.height > 640 * 480) + sf->svc_use_lowres_part = 1; } if (speed >= 8) { sf->adaptive_rd_thresh = 4; - sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; - sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + sf->skip_encode_sb = 1; + sf->nonrd_keyframe = 1; + if (!cpi->use_svc) cpi->max_copied_frame = 4; + if (cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; + if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. 
if (!is_keyframe) { int i = 0; @@ -501,10 +633,38 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // More aggressive short circuit for speed 8. - sf->short_circuit_low_temp_var = 2; + sf->short_circuit_low_temp_var = 3; + // Use level 2 for noisey cases as there is a regression in some + // noisy clips with level 3. + if (cpi->noise_estimate.enabled && cm->width >= 1280 && + cm->height >= 720) { + NOISE_LEVEL noise_level = + vp9_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level >= kMedium) sf->short_circuit_low_temp_var = 2; + } + // Since the short_circuit_low_temp_var is used, reduce the + // adaptive_rd_thresh level. + if (cm->width * cm->height > 352 * 288) + sf->adaptive_rd_thresh = 1; + else + sf->adaptive_rd_thresh = 2; } sf->limit_newmv_early_exit = 0; - sf->bias_golden = 0; + sf->use_simple_block_yrd = 1; + } + if (sf->use_altref_onepass) { + if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + if (cpi->count_arf_frame_usage == NULL) + cpi->count_arf_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_arf_frame_usage)); + if (cpi->count_lastgolden_frame_usage == NULL) + cpi->count_lastgolden_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_lastgolden_frame_usage)); } } @@ -514,6 +674,12 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { RD_OPT *const rd = &cpi->rd; int i; + // best quality defaults + // Some speed-up features even for best quality as minimal impact on quality. + sf->partition_search_breakout_thr.dist = (1 << 19); + sf->partition_search_breakout_thr.rate = 80; + sf->ml_partition_search_early_termination = 0; + if (oxcf->mode == REALTIME) { set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); } else if (oxcf->mode == GOOD) { @@ -535,6 +701,21 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { rd->thresh_mult_sub8x8[i] = INT_MAX; } } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { @@ -589,6 +770,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_domain_thresh = 99.0; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; + sf->allow_acl = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -626,45 +808,25 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // Some speed-up features even for best quality as minimal impact on quality. 
sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; - sf->partition_search_breakout_dist_thr = (1 << 19); - sf->partition_search_breakout_rate_thr = 80; - if (oxcf->mode == REALTIME) - set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); - else if (oxcf->mode == GOOD) - set_good_speed_feature(cpi, cm, sf, oxcf->speed); - - cpi->full_search_sad = vp9_full_search_sad; - cpi->diamond_search_sad = vp9_diamond_search_sad; - - sf->allow_exhaustive_searches = 1; - if (oxcf->mode == BEST) { - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 20); - else - sf->exhaustive_searches_thresh = (1 << 21); - sf->max_exaustive_pct = 100; + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) + : INT_MAX; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; } - } else { - int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 22); - else - sf->exhaustive_searches_thresh = (1 << 23); - sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; - if (speed > 0) - sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; - - for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range; - sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[speed][i].interval; - } } + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_independent(cpi, sf, oxcf->speed, + oxcf->content); + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed); + + cpi->diamond_search_sad = vp9_diamond_search_sad; + // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. if (oxcf->pass == 1) sf->optimize_coefficients = 0; @@ -696,4 +858,19 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!cpi->oxcf.frame_periodic_boost) { sf->max_delta_qindex = 0; } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; + + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.h b/libs/libvpx/vp9/encoder/vp9_speed_features.h index 6d0b9420a1..50d52bc23a 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.h @@ -193,6 +193,11 @@ typedef struct MV_SPEED_FEATURES { int fullpel_search_step_param; } MV_SPEED_FEATURES; +typedef struct PARTITION_SEARCH_BREAKOUT_THR { + int64_t dist; + int rate; +} PARTITION_SEARCH_BREAKOUT_THR; + #define MAX_MESH_STEP 4 typedef struct MESH_PATTERN { @@ -226,8 +231,13 @@ typedef struct SPEED_FEATURES { // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. + // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0). int adaptive_rd_thresh; + // Flag to use adaptive_rd_thresh when row-mt is enabled, only for non-rd + // pickmode. + int adaptive_rd_thresh_row_mt; + // Enables skipping the reconstruction step (idct, recon) in the // intermediate steps assuming the last frame didn't have too many intra // blocks and the q is less than a threshold. @@ -244,6 +254,10 @@ typedef struct SPEED_FEATURES { int allow_quant_coeff_opt; double quant_opt_thresh; + // Enable asymptotic closed-loop encoding decision for key frame and + // alternate reference frames. + int allow_acl; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -313,15 +327,9 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; - // Flag for allowing some use of exhaustive searches; - int allow_exhaustive_searches; - // Threshold for allowing exhaustive motion search. int exhaustive_searches_thresh; - // Maximum number of exhaustive searches for a frame. - int max_exaustive_pct; - // Pattern to be used for any exhaustive mesh searches. MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; @@ -438,8 +446,10 @@ typedef struct SPEED_FEATURES { INTERP_FILTER_MASK interp_filter_search_mask; // Partition search early breakout thresholds. - int64_t partition_search_breakout_dist_thr; - int partition_search_breakout_rate_thr; + PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; + + // Machine-learning based partition search early termination + int ml_partition_search_early_termination; // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -452,11 +462,13 @@ typedef struct SPEED_FEATURES { int short_circuit_flat_blocks; // Skip a number of expensive mode evaluations for blocks with very low - // temporal variance. - // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32. + // temporal variance. If the low temporal variance flag is set for a block, + // do the following: + // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32. // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and // 32x16. + // 3: Same as (2), but also skip golden zeromv.
int short_circuit_low_temp_var; // Limits the rd-threshold update for early exit for the newmv-last mode, @@ -469,6 +481,33 @@ typedef struct SPEED_FEATURES { // Bias to use base mv and skip 1/4 subpel search when use base mv in // enhancement layer. int base_mv_aggressive; + + // Global flag to enable partition copy from the previous frame. + int copy_partition_flag; + + // Compute the source sad for every superblock of the frame, + // prior to encoding the frame, to be used to bypass some encoder decisions. + int use_source_sad; + + int use_simple_block_yrd; + + // If source sad of superblock is high (> adapt_partition_thresh), will switch + // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition + // based on the nonrd-pickmode). + int adapt_partition_source_sad; + int adapt_partition_thresh; + + // Enable use of alt-refs in 1 pass VBR. + int use_altref_onepass; + + // Enable use of compound prediction, for nonrd_pickmode with nonzero lag. + int use_compound_nonrd_pickmode; + + // Always use nonrd_pick_intra for all block sizes on keyframes. + int nonrd_keyframe; + + // For SVC: enables use of partition from lower spatial resolution. + int svc_use_lowres_part; } SPEED_FEATURES; struct VP9_COMP; diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2d29e268b1..2636bd9a58 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -36,12 +36,16 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; + svc->non_reference_frame = 0; + for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = 0; - cpi->svc.ext_lst_fb_idx[sl] = 0; - cpi->svc.ext_gld_fb_idx[sl] = 1; - cpi->svc.ext_alt_fb_idx[sl] = 2; + svc->ext_frame_flags[sl] = 0; + svc->ext_lst_fb_idx[sl] = 0; + svc->ext_gld_fb_idx[sl] = 1; + svc->ext_alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = EIGHTTAP; + svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. 
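+ // Note on the defaults above: phase 0 with EIGHTTAP amounts to plain
+ // sub-sampling of the source; vp9_one_pass_cbr_svc_start_layer() later in
+ // this change switches layers at VGA resolution or below to BILINEAR with
+ // phase 8, which centers the target pixel for a symmetric averaging filter.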
} if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { @@ -171,7 +175,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); lrc->optimal_buffer_level = @@ -349,6 +353,7 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { } } +#if !CONFIG_REALTIME_ONLY void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; @@ -364,6 +369,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { } svc->spatial_layer_id = 0; } +#endif // !CONFIG_REALTIME_ONLY void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = @@ -384,9 +390,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { .is_key_frame; } -static void get_layer_resolution(const int width_org, const int height_org, - const int num, const int den, int *width_out, - int *height_out) { +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; @@ -601,6 +607,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *lc = NULL; if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; cpi->svc.force_zero_mode_spatial_ref = 1; + cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); @@ -650,6 +657,37 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + // For resolutions <= VGA: set phase of the filter = 8 (for symmetric + // averaging filter), use bilinear for now. + if (width * height <= 640 * 480) { + cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; + } + + // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use + // of base motion vectors if spatial scale factors for any layers are not 2, + // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. + // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. 
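+ // Illustrative cases for the check below: scaling_factor_num = 1 with
+ // scaling_factor_den = 2 (i.e. a 2x2 downscale per spatial layer) keeps
+ // use_base_mv enabled; num = 1 with den = 4 is accepted only for the base
+ // layer (sl == 0) of a 3 spatial layer configuration; any other ratio
+ // disables use_base_mv.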
+ if (cpi->svc.number_spatial_layers > 1) { + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { + lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; + if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && + !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && + cpi->svc.number_spatial_layers == 3)) { + cpi->svc.use_base_mv = 0; + break; + } + } + } + + cpi->svc.non_reference_frame = 0; + if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { + cpi->svc.non_reference_frame = 1; + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -824,3 +862,28 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } + +void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { + SVC *svc = &cpi->svc; + int sl, tl; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->buffer_level = lrc->optimal_buffer_level; + } + } + } +} diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h index ee7a6638b4..b7cdfd9623 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -49,7 +49,7 @@ typedef struct { uint8_t speed; } LAYER_CONTEXT; -typedef struct { +typedef struct SVC { int spatial_layer_id; int temporal_layer_id; int number_spatial_layers; @@ -87,7 +87,20 @@ typedef struct { int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; + int non_reference_frame; int use_base_mv; + // Used to control the downscaling filter for source scaling, for 1 pass CBR. + // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + // = 8 will center the target pixel and get a symmetric averaging filter. + // downsample_filter_type: 4 filters may be used: eighttap_regular, + // eighttap_smooth, eighttap_sharp, and bilinear. + INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; + int downsample_filter_phase[VPX_SS_MAX_LAYERS]; + + BLOCK_SIZE *prev_partition_svc; + int mi_stride[VPX_MAX_LAYERS]; + + int first_layer_denoise; } SVC; struct VP9_COMP; @@ -117,6 +130,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi); // Initialize second pass rc for spatial svc. 
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + // Increment number of video frames in layer void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); @@ -137,6 +154,8 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c index a167eeb15d..2758c42aeb 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -8,13 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -51,16 +55,19 @@ static void temporal_filter_predictors_mb_c( #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, - 16, 16, which_mv, kernel, MV_PRECISION_Q3, - x, y, xd->bd); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv, + scale, 16, 16, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); - vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[256]), uv_block_width, &mv, scale, uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y, xd->bd); - vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[512]), uv_block_width, &mv, scale, uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y, xd->bd); @@ -87,16 +94,23 @@ void vp9_temporal_filter_init(void) { for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; } -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, - uint8_t *frame2, unsigned int block_width, +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, + const uint8_t *frame2, + unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, + int filter_weight, uint32_t *accumulator, uint16_t *count) { unsigned int i, j, k; int modifier; int byte = 0; const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + assert(strength >= 0); + assert(strength <= 6); + + assert(filter_weight >= 0); + assert(filter_weight <= 2); + for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { int pixel_value = *frame2; @@ -150,11 +164,11 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_temporal_filter_apply_c( - uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, + const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, uint16_t *count) { - uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); + int filter_weight, uint32_t *accumulator, uint16_t *count) { + const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); + const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); unsigned int i, j, k; int modifier; int byte = 0; @@ -208,24 +222,25 @@ void vp9_highbd_temporal_filter_apply_c( } #endif // CONFIG_VP9_HIGHBITDEPTH -static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride) { - MACROBLOCK *const x = &cpi->td.mb; +static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, + ThreadData *td, + uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, + int stride, MV *ref_mv) { + MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS old_search_method = mv_sf->search_method; + const SEARCH_METHODS search_method = HEX; int step_param; int sadpb = x->sadperbit16; - int bestsme = INT_MAX; + uint32_t bestsme = UINT_MAX; uint32_t distortion; uint32_t sse; int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; MV best_ref_mv1 = { 0, 0 }; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = x->plane[0].src; @@ -243,11 +258,14 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, step_param = mv_sf->reduce_first_step_size; step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - mv_sf->search_method = HEX; + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, - ref_mv, 0, 0); - mv_sf->search_method = old_search_method; + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, ref_mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; // Ignore mv costing by sending NULL pointer instead of cost array bestsme = cpi->find_fractional_mv_step( @@ -263,22 +281,24 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, return bestsme; } -static void temporal_filter_iterate_c(VP9_COMP *cpi, - YV12_BUFFER_CONFIG **frames, - int frame_count, int alt_ref_index, - int strength, - struct scale_factors *scale) { +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end) { + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int frame_count = arnr_filter_data->frame_count; + int alt_ref_index = arnr_filter_data->alt_ref_index; + int strength = arnr_filter_data->strength; + struct scale_factors *scale = &arnr_filter_data->sf; int byte; int frame; - int mb_col, mb_row; 
+ int mb_col; unsigned int filter_weight; int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - int mb_y_offset = 0; - int mb_uv_offset = 0; - DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH @@ -290,10 +310,11 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, #endif const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + // Addition of the tile col level offsets + int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start; + int mb_uv_offset = + mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { predictor = CONVERT_TO_BYTEPTR(predictor16); @@ -302,215 +323,186 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } #endif - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) + // A 6/8 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the + // Y and therefore only extended by 8. The largest mv that a UV block + // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. + // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than + // 8 - VP9_INTERP_EXTEND. + // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). + td->mb.mv_limits.row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_max = + ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); - for (mb_row = 0; mb_row < mb_rows; mb_row++) { - // Source frames are extended to 16 pixels. This is different than - // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) - // A 6/8 tap filter is used for motion search. This requires 2 pixels - // before and 3 pixels after. So the largest Y mv on a border would - // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the - // Y and therefore only extended by 8. The largest mv that a UV block - // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. - // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than - // 8 - VP9_INTERP_EXTEND. - // To keep the mv in play for both Y and UV planes the max that it - // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). 
- cpi->td.mb.mv_limits.row_min = - -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { + int i, j, k; + int stride; + MV ref_mv; - for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int i, j, k; - int stride; + vp9_zero_array(accumulator, 16 * 16 * 3); + vp9_zero_array(count, 16 * 16 * 3); - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_max = + ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); - cpi->td.mb.mv_limits.col_min = - -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + unsigned int src_variance; + struct buf_2d src; - for (frame = 0; frame < frame_count; frame++) { - const int thresh_low = 10000; - const int thresh_high = 20000; - - if (frames[frame] == NULL) continue; - - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; - - if (frame == alt_ref_index) { - filter_weight = 2; - } else { - // Find best match in this frame by MC - int err = temporal_filter_find_matching_mb_c( - cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); - - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; - } - - if (filter_weight != 0) { - // Construct the predictors - temporal_filter_predictors_mb_c( - mbd, frames[frame]->y_buffer + mb_y_offset, - frames[frame]->u_buffer + mb_uv_offset, - frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); - -#if CONFIG_VP9_HIGHBITDEPTH - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int adj_strength = strength + 2 * (mbd->bd - 8); - // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply_c( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); - } else { - // Apply the filter (YUV) - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - // TODO(jingning): Need SIMD optimization for this. 
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); -#endif // CONFIG_VP9_HIGHBITDEPTH - } - } + src.buf = f->y_buffer + mb_y_offset; + src.stride = f->y_stride; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst1_16; - uint16_t *dst2_16; - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } - - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - dst2_16 = CONVERT_TO_SHORTPTR(dst2); - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1_16[byte] = (uint16_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } - - byte += stride - mb_uv_width; - } + src_variance = + vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); } else { - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1[byte] = (uint8_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - mb_uv_width; - } + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); } #else + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + } + + for (frame = 0; frame < frame_count; frame++) { + const uint32_t thresh_low = 10000; + const uint32_t thresh_high = 20000; + + if (frames[frame] == NULL) continue; + + ref_mv.row = 0; + ref_mv.col = 0; + + if (frame == alt_ref_index) { + filter_weight = 2; + } 
else { + // Find best match in this frame by MC + uint32_t err = temporal_filter_find_matching_mb_c( + cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + &ref_mv); + + // Assign higher weight to matching MB if its error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; + } + + if (filter_weight != 0) { + // Construct the predictors + temporal_filter_predictors_mb_c( + mbd, frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, + mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, + mb_col * 16, mb_row * 16); + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int adj_strength = strength + 2 * (mbd->bd - 8); + // Apply the filter (YUV) + vp9_highbd_temporal_filter_apply( + f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, + adj_strength, filter_weight, accumulator, count); + vp9_highbd_temporal_filter_apply( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 256, count + 256); + vp9_highbd_temporal_filter_apply( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 512, count + 512); + } else { + // Apply the filter (YUV) + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); + } +#else + // Apply the filter (YUV) + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst1_16; + uint16_t *dst2_16; + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + 
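+ // Per-pixel normalization below: accumulator[] holds the weighted sum of
+ // predictor pixels and count[] the sum of the weights. fixed_divide[c] is
+ // 0x80000 / c (0x80000 == 1 << 19), so multiplying by it and shifting right
+ // by 19 divides by the accumulated weight; adding (count >> 1) first gives
+ // round-to-nearest.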
unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1_16[byte] = (uint16_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - mb_uv_width; + } + } else { // Normalize filter output to produce AltRef frame dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; @@ -554,12 +546,93 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } byte += stride - mb_uv_width; } -#endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; - mb_uv_offset += mb_uv_width; } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; +#else + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (uint8_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + mb_y_offset += 16; + mb_uv_offset += mb_uv_width; + } +} + +static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, + int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileInfo *tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + const int mb_row_start = (tile_info->mi_row_start) >> 1; + const int mb_row_end = (tile_info->mi_row_end + 1) >> 1; + const int mb_col_start = (tile_info->mi_col_start) >> 1; + const int mb_col_end = (tile_info->mi_col_end + 1) >> 1; + int mb_row; + + for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { + vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start, + mb_col_end); + } +} + +static void temporal_filter_iterate_c(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_row, tile_col; + MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + // Save input state + uint8_t *input_buffer[MAX_MB_PLANE]; + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + + vp9_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); + } } // Restore input state @@ -638,14 +711,16 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; int frame; int frames_to_blur; int 
start_frame; int strength; int frames_to_blur_backward; int frames_to_blur_forward; - struct scale_factors sf; - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; + struct scale_factors *sf = &arnr_filter_data->sf; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int rdmult; // Apply context specific adjustments to the arnr filter parameters. adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); @@ -653,6 +728,10 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { frames_to_blur_forward = ((frames_to_blur - 1) / 2); start_frame = distance + frames_to_blur_forward; + arnr_filter_data->strength = strength; + arnr_filter_data->frame_count = frames_to_blur; + arnr_filter_data->alt_ref_index = frames_to_blur_backward; + // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; @@ -670,13 +749,13 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { int frame_used = 0; #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height); @@ -697,7 +776,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { "Failed to reallocate alt_ref_buffer"); } frames[frame] = vp9_scale_if_required( - cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0); + cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, + EIGHTTAP, 0); ++frame_used; } } @@ -708,17 +788,25 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { // ARF is produced at the native frame size and resized when coded. #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); #endif // CONFIG_VP9_HIGHBITDEPTH } } - temporal_filter_iterate_c(cpi, frames, frames_to_blur, - frames_to_blur_backward, strength, &sf); + // Initialize errorperbit and sadperbit.
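+ // The filter's motion search runs outside normal frame encoding, so the
+ // rd constants below are seeded from the fixed ARNR_FILT_QINDEX rather
+ // than the current frame's quantizer.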
+ rdmult = (int)vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); + if (rdmult < 1) rdmult = 1; + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); + + if (!cpi->row_mt) + temporal_filter_iterate_c(cpi); + else + vp9_temporal_filter_row_mt(cpi); } diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h index f537b8870a..775e49cc53 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -15,9 +15,15 @@ extern "C" { #endif +#define ARNR_FILT_QINDEX 128 + void vp9_temporal_filter_init(void); void vp9_temporal_filter(VP9_COMP *cpi, int distance); +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.c b/libs/libvpx/vp9/encoder/vp9_tokenize.c index dc2616dbe1..814d769be3 100644 --- a/libs/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libs/libvpx/vp9/encoder/vp9_tokenize.c @@ -123,7 +123,7 @@ const int16_t vp9_cat6_low_cost[256] = { 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 }; -const int vp9_cat6_high_cost[64] = { +const uint16_t vp9_cat6_high_cost[64] = { 88, 2251, 2727, 4890, 3148, 5311, 5787, 7950, 3666, 5829, 6305, 8468, 6726, 8889, 9365, 11528, 3666, 5829, 6305, 8468, 6726, 8889, 9365, 11528, 7244, 9407, 9883, 12046, 10304, 12467, 12943, 15106, 3666, @@ -133,7 +133,7 @@ const int vp9_cat6_high_cost[64] = { }; #if CONFIG_VP9_HIGHBITDEPTH -const int vp9_cat6_high10_high_cost[256] = { +const uint16_t vp9_cat6_high10_high_cost[256] = { 94, 2257, 2733, 4896, 3154, 5317, 5793, 7956, 3672, 5835, 6311, 8474, 6732, 8895, 9371, 11534, 3672, 5835, 6311, 8474, 6732, 8895, 9371, 11534, 7250, 9413, 9889, 12052, 10310, 12473, 12949, 15112, 3672, @@ -159,7 +159,7 @@ const int vp9_cat6_high10_high_cost[256] = { 18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074, 24237, 24713, 26876 }; -const int vp9_cat6_high12_high_cost[1024] = { +const uint16_t vp9_cat6_high12_high_cost[1024] = { 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.h b/libs/libvpx/vp9/encoder/vp9_tokenize.h index c905715d7d..b2f63ffef5 100644 --- a/libs/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libs/libvpx/vp9/encoder/vp9_tokenize.h @@ -76,25 +76,18 @@ extern const TOKENVALUE *vp9_dct_value_tokens_ptr; extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens; extern const int *vp9_dct_cat_lt_10_value_cost; extern const int16_t vp9_cat6_low_cost[256]; -extern const int vp9_cat6_high_cost[64]; -extern const int vp9_cat6_high10_high_cost[256]; -extern const int vp9_cat6_high12_high_cost[1024]; -static INLINE int vp9_get_cost(int16_t token, EXTRABIT extrabits, - const int *cat6_high_table) { - if (token != CATEGORY6_TOKEN) - return vp9_extra_bits[token].cost[extrabits >> 1]; - return vp9_cat6_low_cost[(extrabits >> 1) & 0xff] + - cat6_high_table[extrabits >> 9]; -} +extern const uint16_t vp9_cat6_high_cost[64]; +extern const uint16_t vp9_cat6_high10_high_cost[256]; +extern const uint16_t vp9_cat6_high12_high_cost[1024]; #if CONFIG_VP9_HIGHBITDEPTH -static INLINE const int *vp9_get_high_cost_table(int bit_depth) { 
+static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { return bit_depth == 8 ? vp9_cat6_high_cost : (bit_depth == 10 ? vp9_cat6_high10_high_cost : vp9_cat6_high12_high_cost); } #else -static INLINE const int *vp9_get_high_cost_table(int bit_depth) { +static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { (void)bit_depth; return vp9_cat6_high_cost; } @@ -118,7 +111,7 @@ static INLINE int16_t vp9_get_token(int v) { } static INLINE int vp9_get_token_cost(int v, int16_t *token, - const int *cat6_high_table) { + const uint16_t *cat6_high_table) { if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { EXTRABIT extrabits; *token = CATEGORY6_TOKEN; diff --git a/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 0000000000..460dab6593 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values +// which may be bound by the edges of the block being filtered. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32768 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 + +// Load values from 'a' and 'b'. Compute the difference squared and sum +// neighboring values such that: +// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 +// Values to the left and right of the row are set to 0. +// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. +static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { + const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); + const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); + + const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); + const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); + + // Shift all the values one place to the left/right so we can efficiently sum + // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. + const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); + const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); + + // It becomes necessary to treat the values as unsigned at this point. The + // 255^2 fits in uint16_t but not int16_t.
Use saturating adds from this point + // forward since the filter is only applied to smooth small pixel changes. + // Once the value has saturated to uint16_t it is well outside the useful + // range. + __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum = sum_u16; +} + +static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, + __m128i *sum_1) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); + const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); + const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); + const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); + const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); + + const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); + const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); + const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); + const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); + + __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); + // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. + __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + + __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_0 = sum_u16; + + shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); + shift_right = _mm_srli_si128(diff_sq_1_u16, 2); + + sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_1 = sum_u16; +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static __m128i average_8(__m128i sum, const __m128i mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. 
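+ // Worked example of the multiply-high division above: with a summed
+ // modifier of 100 and NEIGHBOR_CONSTANT_4, _mm_mulhi_epu16() computes
+ // (100 * 49152) >> 16 = 75, i.e. 100 * 3 / 4. Clamping to 16 here keeps the
+ // (16 - sum) step below non-negative.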
+ sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i mul_constants_0, + const __m128i mul_constants_1, const int strength, + const int rounding, const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = 
_mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, + const uint8_t *b, unsigned int width, + unsigned int height, int strength, + int weight, uint32_t *accumulator, + uint16_t *count) { + unsigned int h; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + + assert(strength >= 0); + assert(strength <= 6); + + assert(weight >= 0); + assert(weight <= 2); + + assert(width == 8 || width == 16); + + if (width == 8) { + __m128i sum_row_a, sum_row_b, sum_row_c; + __m128i mul_constants = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_8(a, b, &sum_row_a); + sum_8(a + stride, b + width, &sum_row_b); + sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_c, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + + for (h = 0; h < height - 2; ++h) { + sum_8(a, b + width, &sum_row_c); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); + sum_row_a = + average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a = sum_row_b; + sum_row_b = sum_row_c; + } + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + } else { // width == 16 + __m128i sum_row_a_0, sum_row_a_1; + __m128i sum_row_b_0, sum_row_b_1; + __m128i sum_row_c_0, sum_row_c_1; + __m128i mul_constants_0 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), + mul_constants_1 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_16(a, b, &sum_row_a_0, &sum_row_a_1); + sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); + + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = 
_mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + for (h = 0; h < height - 2; ++h) { + sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); + + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); + + average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a_0 = sum_row_b_0; + sum_row_a_1 = sum_row_b_1; + sum_row_b_0 = sum_row_c_0; + sum_row_b_1 = sum_row_c_1; + } + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + } +} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 0712779b75..dbd243ac10 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -71,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) { } static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -181,19 +182,19 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, int16_t 
*qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; + // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -215,8 +216,6 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, int index = 0; (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -708,61 +707,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = 
pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -897,7 +844,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -914,7 +861,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1127,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1184,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1212,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { static void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); @@ -1559,8 +1489,8 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -2004,13 +1934,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, 
tran_low_t *output, int stride, diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm index ced37bd16e..8152dce864 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -62,25 +63,7 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m0, 2 psllw m1, 2 -%if CONFIG_VP9_HIGHBITDEPTH - ; sign extension - mova m2, m0 - mova m3, m1 - punpcklwd m0, m0 - punpcklwd m1, m1 - punpckhwd m2, m2 - punpckhwd m3, m3 - psrad m0, 16 - psrad m1, 16 - psrad m2, 16 - psrad m3, 16 - mova [outputq], m0 - mova [outputq + 16], m2 - mova [outputq + 32], m1 - mova [outputq + 48], m3 -%else - mova [outputq], m0 - mova [outputq + 16], m1 -%endif + STORE_TRAN_LOW 0, outputq, 0, 2, 3 + STORE_TRAN_LOW 1, outputq, 8, 2, 3 RET diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index fb2a925417..bf874a09ec 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -12,23 +12,26 @@ #include <tmmintrin.h> // SSSE3 #include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; + // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair.
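The constants comment above describes the pattern shared by the SSE2 and SSSE3 forward transforms: a cosine pair is replicated across all eight 16-bit lanes so that one PMADDWD over interleaved inputs evaluates the butterfly a*c0 + b*c1 in every 32-bit lane, before the usual rounding by DCT_CONST_ROUNDING and shift by DCT_CONST_BITS. A minimal sketch of that pattern, assuming only SSE2; make_pair_epi16 is an illustrative stand-in for the pair_set_epi16() helper used in these files, and the coefficient values are arbitrary.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for pair_set_epi16(): replicate (c0, c1) across
 * the eight 16-bit lanes, c0 in the even lanes and c1 in the odd lanes. */
static __m128i make_pair_epi16(int16_t c0, int16_t c1) {
  return _mm_set_epi16(c1, c0, c1, c0, c1, c0, c1, c0);
}

int main(void) {
  const __m128i a = _mm_set1_epi16(3);          /* one butterfly input */
  const __m128i b = _mm_set1_epi16(5);          /* the other input */
  const __m128i lo = _mm_unpacklo_epi16(a, b);  /* a0 b0 a1 b1 ... */
  const __m128i k = make_pair_epi16(7, -2);     /* example pair {c0, c1} */
  const __m128i prod = _mm_madd_epi16(lo, k);   /* a*c0 + b*c1 per 32-bit lane */
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, prod);
  printf("%d\n", out[0]);                       /* 3*7 + 5*(-2) = 11 */
  return 0;
}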
const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -50,8 +53,6 @@ void vp9_fdct8x8_quant_ssse3( int index = 0; (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -328,15 +329,15 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -398,20 +399,21 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + // Maybe a more efficient way to store 0? 
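The store_tran_low() and store_zero_tran_low() calls replacing the raw _mm_store_si128() stores come from the newly included bitdepth_conversion_sse2.h: when CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is 32 bits wide, so a vector of eight 16-bit results has to be sign-extended into two 128-bit stores, while plain builds can store the vector as is (and the zero case is simply two zero stores). A hedged sketch of the widening half of that job; the names below are illustrative, not the header's.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

typedef int32_t tran_low_t_example;  /* stand-in for tran_low_t in high-bit-depth builds */

/* Widen eight signed 16-bit lanes into eight 32-bit values and store them. */
static void store_widened_example(__m128i a, tran_low_t_example *b) {
  const __m128i sign = _mm_srai_epi16(a, 15);      /* 0x0000 or 0xFFFF per lane */
  const __m128i lo = _mm_unpacklo_epi16(a, sign);  /* sign-extend lanes 0..3 */
  const __m128i hi = _mm_unpackhi_epi16(a, sign);  /* sign-extend lanes 4..7 */
  _mm_storeu_si128((__m128i *)(b + 0), lo);
  _mm_storeu_si128((__m128i *)(b + 4), hi);
}

int main(void) {
  const __m128i v = _mm_setr_epi16(-3, 7, -32768, 32767, 0, 1, -1, 100);
  tran_low_t_example out[8];
  store_widened_example(v, out);
  printf("%d %d %d\n", (int)out[0], (int)out[2], (int)out[3]);  /* -3 -32768 32767 */
  return 0;
}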
+ store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } } @@ -452,10 +454,10 @@ void vp9_fdct8x8_quant_ssse3( } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c index 91d0602f9d..5930bf491e 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/emmintrin_compat.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_context_tree.h" diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_error_avx2.c new file mode 100644 index 0000000000..99fef31d16 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" + +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_256, ssz_256; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_hi, ssz_hi; + __m128i sse_128, ssz_128; + int64_t sse; + const __m256i zero = _mm256_setzero_si256(); + + // If the block size is 16 then the results will fit in 32 bits. + if (block_size == 16) { + __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; + // Load 16 elements for coeff and dqcoeff. + coeff_256 = load_tran_low(coeff); + dqcoeff_256 = load_tran_low(dqcoeff); + // dqcoeff - coeff + dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); + // madd (dqcoeff - coeff) + dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); + // madd coeff + coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); + // Save the higher 64 bit of each 128 bit lane. + dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); + coeff_hi = _mm256_srli_si256(coeff_256, 8); + // Add the higher 64 bit to the low 64 bit. + dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); + coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); + // Expand each double word in the lower 64 bits to quad word. 
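For orientation, this is the quantity vp9_block_error_avx2() accumulates: sse, the sum of squared differences between the dequantized and original coefficients, and ssz, the energy of the original coefficients (vp9_block_error_fp_avx2() further down returns only the former). A plain scalar reference, written here only as a sketch; tran_low_t_example stands in for tran_low_t.

#include <stdint.h>

typedef int32_t tran_low_t_example;  /* stand-in for tran_low_t */

int64_t block_error_ref(const tran_low_t_example *coeff,
                        const tran_low_t_example *dqcoeff,
                        intptr_t block_size, int64_t *ssz) {
  int64_t sse = 0, sz = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = (int64_t)dqcoeff[i] - coeff[i];
    sse += diff * diff;                  /* distortion left after quantization */
    sz += (int64_t)coeff[i] * coeff[i];  /* energy of the original coefficients */
  }
  *ssz = sz;
  return sse;
}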
+ sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); + ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); + } else { + int i; + assert(block_size % 32 == 0); + sse_256 = zero; + ssz_256 = zero; + + for (i = 0; i < block_size; i += 32) { + __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; + // Load 32 elements for coeff and dqcoeff. + coeff_0 = load_tran_low(coeff + i); + dqcoeff_0 = load_tran_low(dqcoeff + i); + coeff_1 = load_tran_low(coeff + i + 16); + dqcoeff_1 = load_tran_low(dqcoeff + i + 16); + // dqcoeff - coeff + dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); + dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); + // madd (dqcoeff - coeff) + dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); + dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); + // madd coeff + coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); + coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); + // Add the first madd (dqcoeff - coeff) with the second. + dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); + // Add the first madd (coeff) with the second. + coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); + // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + ssz_hi = _mm256_srli_si256(ssz_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), + _mm256_extractf128_si256(ssz_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)(&sse), sse_128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_128); + return sse; +} + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. 
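Both error kernels finish with the same horizontal reduction: four 64-bit partial sums sit in one 256-bit register, the upper half of each 128-bit lane is folded onto the lower half, the two lanes are added in a 128-bit register, and the low 64 bits are stored out. A self-contained sketch of that tail, assuming AVX2 is available.

#include <immintrin.h>  /* AVX2 */
#include <stdint.h>

/* Reduce four 64-bit partial sums held in a 256-bit register to one value. */
int64_t hsum_epi64_example(__m256i v) {
  /* Fold the upper 64 bits of each 128-bit lane onto the lower 64 bits. */
  const __m256i hi64 = _mm256_srli_si256(v, 8);
  const __m256i folded = _mm256_add_epi64(v, hi64);
  /* Add the two 128-bit lanes together, then keep the low 64 bits. */
  const __m128i sum = _mm_add_epi64(_mm256_castsi256_si128(folded),
                                    _mm256_extractf128_si256(folded, 1));
  int64_t out;
  _mm_storel_epi64((__m128i *)&out, sum);
  return out;
}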
+ const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c deleted file mode 100644 index 453af2a403..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Usee of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vp9_rtcd.h" -#include "vpx/vpx_integer.h" - -int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; - __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; - __m256i sse_reg_64hi, ssz_reg_64hi; - __m128i sse_reg128, ssz_reg128; - int64_t sse; - int i; - const __m256i zero_reg = _mm256_set1_epi16(0); - - // init sse and ssz registerd to zero - sse_reg = _mm256_set1_epi16(0); - ssz_reg = _mm256_set1_epi16(0); - - for (i = 0; i < block_size; i += 16) { - // load 32 bytes from coeff and dqcoeff - coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); - dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); - // dqcoeff - coeff - dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); - // madd (dqcoeff - coeff) - dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); - // madd coeff - coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); - // expand each double word of madd (dqcoeff - coeff) to quad word - exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); - exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); - // expand each double word of madd (coeff) to quad word - exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); - exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); - // add each quad word of madd (dqcoeff - coeff) and madd (coeff) - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); - } - // save the higher 64 bit of each 128 bit lane - sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); - ssz_reg_64hi = 
_mm256_srli_si256(ssz_reg, 8); - // add the higher 64 bit to the low 64 bit - sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); - ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); - - // add each 64 bit from each of the 128 bit lane of the 256 bit - sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), - _mm256_extractf128_si256(sse_reg, 1)); - - ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), - _mm256_extractf128_si256(ssz_reg, 1)); - - // store the results - _mm_storel_epi64((__m128i *)(&sse), sse_reg128); - - _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); - return sse; -} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm index 5b0238272b..11d473b2df 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm +++ b/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -22,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pxor m4, m4 ; sse accumulator pxor m6, m6 ; ssz accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 15bit+sign, so squares are 30bit, and @@ -38,25 +39,19 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 ; accumulate in 64bit punpckldq m7, m0, m5 punpckhdq m0, m5 paddq m4, m7 - punpckldq m7, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m7 punpckldq m7, m2, m5 - paddq m4, m1 + paddq m4, m0 punpckhdq m2, m5 paddq m6, m7 - punpckldq m7, m3, m5 paddq m6, m2 - punpckhdq m3, m5 - paddq m6, m7 - paddq m6, m3 - add sizeq, mmsize - jl .loop + jg .loop ; accumulate horizontally and store in return value movhlps m5, m4 @@ -75,39 +70,37 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz %endif RET -; Compute the sum of squared difference between two int16_t vectors. -; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff, +; Compute the sum of squared difference between two tran_low_t vectors. +; Vectors are converted (if necessary) to int16_t for calculations. +; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff, ; intptr_t block_size) INIT_XMM sse2 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size pxor m4, m4 ; sse accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 
15bit+sign, so squares are 30bit, and ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) pmaddwd m0, m0 pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 ; accumulate in 64bit punpckldq m3, m0, m5 punpckhdq m0, m5 paddq m4, m3 - punpckldq m3, m1, m5 paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 - add sizeq, mmsize - jl .loop + jnz .loop ; accumulate horizontally and store in return value movhlps m5, m4 diff --git a/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index fa2a6449b0..7685e7bc3e 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -13,192 +13,895 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_scale/yv12config.h" -extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} -static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, - int h) { +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi16(0x00FF); - const int max_width = w & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t 
*dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. + + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 
02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * 
width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + 
s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. 
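The comment above is the key constraint for PMADDUBSW-based convolution: the instruction reads the filter operand as signed bytes and saturates each 16-bit pair sum, so a tap of 128 (or an adjacent pair whose sum exceeds 127) cannot be fed in directly. Subtracting 64 from the offending tap keeps everything representable, and the missing 64*src term can be added back separately (presumably inside the convolve8 helpers the shuffled filters feed into). A scalar illustration of why the two routes agree:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t src = 200;
  const int tap = 128;              /* largest possible 8-tap coefficient */
  const int stored = tap - 64;      /* what is actually fed to PMADDUBSW */
  const int partial = src * stored; /* now well inside the signed 16-bit range */
  const int full = partial + src * 64;  /* add the 64*src term back */
  printf("%d %d\n", src * tap, full);   /* both print 25600 */
  return 0;
}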
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 
A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
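Before the SSSE3 row kernel that follows, here is a scalar sketch of what phase-0 1-to-2 scaling means for a single row: every even destination column is a straight copy of a source pixel, and every odd column is an 8-tap interpolation halfway between two source pixels. The taps below are the ones hard-coded in the removed filter() helper further down (-1, 6, -19, 78, 78, -19, 6, -1); the new path instead takes its kernel from the coef argument. This is an illustration only and ignores edge handling: it reads 3 pixels left and 4 pixels right of each source column.

#include <stdint.h>

static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

/* One row of phase-0 1:2 upscaling; src must have 3 valid pixels to the left
 * and 4 to the right of the range being read. */
void upscale_row_1_to_2_ref(const uint8_t *src, uint8_t *dst, int src_w) {
  static const int taps[8] = { -1, 6, -19, 78, 78, -19, 6, -1 };  /* sum = 128 */
  int x, k;
  for (x = 0; x < src_w; ++x) {
    int sum = 64;                 /* rounding term before the >> 7 */
    dst[2 * x] = src[x];          /* even column: plain copy of the source */
    for (k = 0; k < 8; ++k) sum += taps[k] * src[x + k - 3];
    dst[2 * x + 1] = clip_u8(sum >> 7);  /* odd column: filtered half-pel value */
  }
}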
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; int y; - for (y = 0; y < h; ++y) { + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { int x; - for (x = 0; x < max_width; x += 16) { - const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); - const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16)); - const __m128i a_and = _mm_and_si128(a, mask); - const __m128i b_and = _mm_and_si128(b, mask); - const __m128i c = _mm_packus_epi16(a_and, b_and); - _mm_storeu_si128((__m128i *)(dst + x), c); + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); } - for (; x < w; ++x) dst[x] = src[x * 
2]; - src += src_stride * 2; - dst += dst_stride; - } -} -static INLINE __m128i filter(const __m128i *const a, const __m128i *const b, - const __m128i *const c, const __m128i *const d, - const __m128i *const e, const __m128i *const f, - const __m128i *const g, const __m128i *const h) { - const __m128i coeffs_ab = - _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1); - const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, - -19, 78, -19, 78, -19, 78, -19); - const __m128i const64_x16 = _mm_set1_epi16(64); - const __m128i ab = _mm_unpacklo_epi8(*a, *b); - const __m128i cd = _mm_unpacklo_epi8(*c, *d); - const __m128i fe = _mm_unpacklo_epi8(*f, *e); - const __m128i hg = _mm_unpacklo_epi8(*h, *g); - const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab); - const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd); - const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd); - const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab); - // can not overflow - const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms); - // can not overflow - const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms); - // can overflow, use saturating add - const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms); - const __m128i round = _mm_adds_epi16(terms, const64_x16); - const __m128i shift = _mm_srai_epi16(round, 7); - return _mm_packus_epi16(shift, shift); -} - -static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) { - const int max_width = w & ~7; - int x = 0; - for (; x < max_width; x += 8) { - const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7)); - const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h); - _mm_storel_epi64((__m128i *)(dst + x), pack); - } -} - -static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int dst_w, - int dst_h) { - dst_w /= 2; - dst_h /= 2; - { - DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]); - uint8_t *tmp0 = tmp + dst_w * 0; - uint8_t *tmp1 = tmp + dst_w * 1; - uint8_t *tmp2 = tmp + dst_w * 2; - uint8_t *tmp3 = tmp + dst_w * 3; - uint8_t *tmp4 = tmp + dst_w * 4; - uint8_t *tmp5 = tmp + dst_w * 5; - uint8_t *tmp6 = tmp + dst_w * 6; - uint8_t *tmp7 = tmp + dst_w * 7; - uint8_t *tmp8 = NULL; - const int max_width = dst_w & ~7; - int y; - eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w); - eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w); - eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w); - eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w); - eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w); - eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w); - eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w); - for (y = 0; y < dst_h; y++) { - int x; - eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w); - for (x = 0; x < max_width; x += 8) { - const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); - const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i AB = 
_mm_unpacklo_epi8(A, B); - __m128i C, D, CD; - _mm_storeu_si128((__m128i *)(dst + x * 2), AB); - { - const __m128i a = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3)); - const __m128i b = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2)); - const __m128i c = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1)); - const __m128i d = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0)); - const __m128i e = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1)); - const __m128i f = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2)); - const __m128i g = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3)); - const __m128i h = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4)); - C = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - { - const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x)); - D = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - CD = _mm_unpacklo_epi8(C, D); - _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD); - } - src += src_stride; - dst += dst_stride * 2; - tmp8 = tmp0; - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = tmp6; - tmp6 = tmp7; - tmp7 = tmp8; - } - } + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); } void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; const int dst_uv_w = dst_w / 2; const int dst_uv_h = dst_h / 2; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); if (dst_w * 2 == src_w && dst_h * 2 == src_h) { - downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); - } else if (dst_w == src_w * 2 && dst_h == src_h * 2) { - // The upsample() supports widths up to 1920 * 2. If greater, fall back - // to vp9_scale_and_extend_frame_c(). 
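For reference, both the removed upsample_1_to_2_ssse3() path and the new scale_plane_1_to_2_phase_0() path implement the same phase-0 1:2 upscale: even output pixels are copied straight from the source, odd output pixels come from an 8-tap filter centred between two source pixels. A minimal scalar sketch of the odd-pixel arithmetic, using the taps and rounding hard-coded in the removed filter() above; the new path does the same computation with the half-pel taps of the selected vp9_filter_kernels[filter_type]. The function name is illustrative only.

static unsigned char scale_1_to_2_odd_pixel_sketch(const unsigned char *p) {
  /* p points at 8 consecutive source pixels; the output sample sits between
   * p[3] and p[4].  The taps sum to 128, hence the +64 bias and >>7, as in
   * the removed filter() (coeffs_ab/coeffs_cd, const64_x16, srai by 7). */
  static const int taps[8] = { -1, 6, -19, 78, 78, -19, 6, -1 };
  int sum = 64, i;
  for (i = 0; i < 8; ++i) sum += taps[i] * p[i];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}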
- if (dst_w / 2 <= 1920) { - upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); } else { - vp9_scale_and_extend_frame_c(src, dst); + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t 
*)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); } else { - vp9_scale_and_extend_frame_c(src, dst); + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm deleted file mode 100644 index e476323e14..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm +++ /dev/null @@ -1,261 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text -ALIGN 16 - -; -; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, -; intptr_t block_size, int64_t *ssz) -; - -INIT_XMM avx -cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz - vzeroupper - - ; If only one iteration is required, then handle this as a special case. - ; It is the most frequent case, so we can have a significant gain here - ; by not setting up a loop and accumulators. - cmp sizeq, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Common case of size == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Load input vectors - mova xm0, [dqcq] - packssdw xm0, [dqcq+16] - mova xm2, [uqcq] - packssdw xm2, [uqcq+16] - - mova xm1, [dqcq+32] - packssdw xm1, [dqcq+48] - mova xm3, [uqcq+32] - packssdw xm3, [uqcq+48] - - ; Compute the errors. - psubw xm0, xm2 - psubw xm1, xm3 - - ; Individual errors are max 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit). - pmaddwd xm2, xm2 - pmaddwd xm3, xm3 - - pmaddwd xm0, xm0 - pmaddwd xm1, xm1 - - ; Squares are always positive, so we can use unsigned arithmetic after - ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will - ; fit in 32bits - paddd xm2, xm3 - paddd xm0, xm1 - - ; Accumulate horizontally in 64 bits, there is no chance of overflow here - pxor xm5, xm5 - - pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits - psrlq xm2, 32 ; Zero extended high of a pair of 32 bits - - pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits - psrlq xm0, 32 ; Zero extended high of a pair of 32 bits - - paddq xm2, xm3 - paddq xm0, xm1 - - psrldq xm3, xm2, 8 - psrldq xm1, xm0, 8 - - paddq xm2, xm3 - paddq xm0, xm1 - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm0 - movq [sszq], xm2 -%else - movd eax, xm0 - pextrd edx, xm0, 1 - movq [sszd], xm2 -%endif - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of size != 16, speculative low precision - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ALIGN 16 -.generic: - pxor xm4, xm4 ; sse accumulator - pxor xm5, xm5 ; overflow detection register for xm4 - pxor xm6, xm6 ; ssz accumulator - pxor xm7, xm7 ; overflow detection register for xm6 - lea uqcq, [uqcq+sizeq*4] - lea dqcq, [dqcq+sizeq*4] - neg sizeq - - ; Push the negative size as the high precision code might need it - push sizeq - -.loop: - ; Load input vectors - mova xm0, [dqcq+sizeq*4] - packssdw xm0, [dqcq+sizeq*4+16] - mova xm2, [uqcq+sizeq*4] - packssdw xm2, [uqcq+sizeq*4+16] - - mova xm1, [dqcq+sizeq*4+32] - packssdw xm1, [dqcq+sizeq*4+48] - mova xm3, [uqcq+sizeq*4+32] - packssdw xm3, [uqcq+sizeq*4+48] - - add sizeq, 16 - - ; Compute the squared errors. - ; Individual errors are max 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit). 
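For reference, the deleted AVX kernel here (and the SSE2 version removed just below) computes a plain sum of squared quantization errors plus a sum of squared coefficients. A scalar sketch of that computation, matching the prototype in the comment above; this is a reference sketch, not the removed code itself.

static int64_t highbd_block_error_8bit_sketch(const int32_t *coeff,
                                              const int32_t *dqcoeff,
                                              intptr_t block_size,
                                              int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int diff = (int)(coeff[i] - dqcoeff[i]); /* inputs are 15 bit + sign */
    error += (int64_t)diff * diff;                 /* sse accumulator (return value) */
    sqcoeff += (int64_t)coeff[i] * coeff[i];       /* ssz accumulator */
  }
  *ssz = sqcoeff;
  return error;
}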
- psubw xm0, xm2 - pmaddwd xm2, xm2 - pmaddwd xm0, xm0 - - psubw xm1, xm3 - pmaddwd xm3, xm3 - pmaddwd xm1, xm1 - - ; Squares are always positive, so we can use unsigned arithmetic after - ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will - ; fit in 32bits - paddd xm2, xm3 - paddd xm0, xm1 - - ; We accumulate using 32 bit arithmetic, but detect potential overflow - ; by checking if the MSB of the accumulators have ever been a set bit. - ; If yes, we redo the whole compute at the end on higher precision, but - ; this happens extremely rarely, so we still achieve a net gain. - paddd xm4, xm0 - paddd xm6, xm2 - por xm5, xm4 ; OR in the accumulator for overflow detection - por xm7, xm6 ; OR in the accumulator for overflow detection - - jnz .loop - - ; Add pairs horizontally (still only on 32 bits) - phaddd xm4, xm4 - por xm5, xm4 ; OR in the accumulator for overflow detection - phaddd xm6, xm6 - por xm7, xm6 ; OR in the accumulator for overflow detection - - ; Check for possibility of overflow by testing if bit 32 of each dword lane - ; have ever been set. If they were not, then there was no overflow and the - ; final sum will fit in 32 bits. If overflow happened, then - ; we redo the whole computation on higher precision. - por xm7, xm5 - pmovmskb r4, xm7 - test r4, 0x8888 - jnz .highprec - - phaddd xm4, xm4 - phaddd xm6, xm6 - pmovzxdq xm4, xm4 - pmovzxdq xm6, xm6 - - ; Restore stack - pop sizeq - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm4 - movq [sszq], xm6 -%else - movd eax, xm4 - pextrd edx, xm4, 1 - movq [sszd], xm6 -%endif - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of size != 16, high precision case - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -.highprec: - pxor xm4, xm4 ; sse accumulator - pxor xm5, xm5 ; dedicated zero register - pxor xm6, xm6 ; ssz accumulator - pop sizeq - -.loophp: - mova xm0, [dqcq+sizeq*4] - packssdw xm0, [dqcq+sizeq*4+16] - mova xm2, [uqcq+sizeq*4] - packssdw xm2, [uqcq+sizeq*4+16] - - mova xm1, [dqcq+sizeq*4+32] - packssdw xm1, [dqcq+sizeq*4+48] - mova xm3, [uqcq+sizeq*4+32] - packssdw xm3, [uqcq+sizeq*4+48] - - add sizeq, 16 - - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - - psubw xm0, xm2 - pmaddwd xm2, xm2 - pmaddwd xm0, xm0 - - psubw xm1, xm3 - pmaddwd xm3, xm3 - pmaddwd xm1, xm1 - - ; accumulate in 64bit - punpckldq xm7, xm0, xm5 - punpckhdq xm0, xm5 - paddq xm4, xm7 - - punpckldq xm7, xm2, xm5 - punpckhdq xm2, xm5 - paddq xm6, xm7 - - punpckldq xm7, xm1, xm5 - punpckhdq xm1, xm5 - paddq xm4, xm7 - - punpckldq xm7, xm3, xm5 - punpckhdq xm3, xm5 - paddq xm6, xm7 - - paddq xm4, xm0 - paddq xm4, xm1 - paddq xm6, xm2 - paddq xm6, xm3 - - jnz .loophp - - ; Accumulate horizontally - movhlps xm5, xm4 - movhlps xm7, xm6 - paddq xm4, xm5 - paddq xm6, xm7 - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm4 - movq [sszq], xm6 -%else - movd eax, xm4 - pextrd edx, xm4, 1 - movq [sszd], xm6 -%endif - RET - -END diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm deleted file mode 100644 index f3b8f01947..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm +++ /dev/null @@ -1,98 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text -ALIGN 16 - -; -; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, -; intptr_t block_size, int64_t *ssz) -; - -INIT_XMM sse2 -cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz - pxor m4, m4 ; sse accumulator - pxor m6, m6 ; ssz accumulator - pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*4] - lea dqcq, [dqcq+sizeq*4] - neg sizeq - - ALIGN 16 - -.loop: - mova m0, [dqcq+sizeq*4] - packssdw m0, [dqcq+sizeq*4+mmsize] - mova m2, [uqcq+sizeq*4] - packssdw m2, [uqcq+sizeq*4+mmsize] - - mova m1, [dqcq+sizeq*4+mmsize*2] - packssdw m1, [dqcq+sizeq*4+mmsize*3] - mova m3, [uqcq+sizeq*4+mmsize*2] - packssdw m3, [uqcq+sizeq*4+mmsize*3] - - add sizeq, mmsize - - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - - psubw m0, m2 - pmaddwd m2, m2 - pmaddwd m0, m0 - - psubw m1, m3 - pmaddwd m3, m3 - pmaddwd m1, m1 - - ; accumulate in 64bit - punpckldq m7, m0, m5 - punpckhdq m0, m5 - paddq m4, m7 - - punpckldq m7, m2, m5 - punpckhdq m2, m5 - paddq m6, m7 - - punpckldq m7, m1, m5 - punpckhdq m1, m5 - paddq m4, m7 - - punpckldq m7, m3, m5 - punpckhdq m3, m5 - paddq m6, m7 - - paddq m4, m0 - paddq m4, m1 - paddq m6, m2 - paddq m6, m3 - - jnz .loop - - ; accumulate horizontally and store in return value - movhlps m5, m4 - movhlps m7, m6 - paddq m4, m5 - paddq m6, m7 - -%if ARCH_X86_64 - movq rax, m4 - movq [sszq], m6 -%else - mov eax, sszm - pshufd m5, m4, 0x1 - movq [eax], m6 - movd eax, m4 - movd edx, m5 -%endif - RET diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 3f8ee5f244..ca0ad4407e 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -8,25 +8,30 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include #include #include #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; __m128i thr; int16_t nzflag; + __m128i eob; + __m128i round, quant, dequant; + (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; + (void)skip_block; + assert(!skip_block); coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; @@ -35,40 +40,106 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values { - __m128i coeff0, coeff1; + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, 
dqcoeff_ptr + n_coeffs + 8); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs @@ -77,134 +148,55 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + 
store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } - n_coeffs += 8 * 2; } - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); } + n_coeffs += 8 * 2; + } - // 
Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index ec61c0c3a7..5703aa3bb6 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION_RODATA pw_1: times 8 dw 1 @@ -18,17 +19,14 @@ pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ + qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. 
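Both the vp9_quantize_fp_sse2() rewrite above and this assembly macro vectorize the same "fast path" quantizer. A scalar sketch of that arithmetic follows (names are illustrative; vp9_quantize_fp_c() is the in-tree scalar reference), with coefficients indexed in their natural order and iscan giving each position's rank in scan order.

static void quantize_fp_sketch(const int16_t *coeff, int n_coeffs,
                               const int16_t *round, const int16_t *quant,
                               int16_t *qcoeff, int16_t *dqcoeff,
                               const int16_t *dequant, const int16_t *iscan,
                               uint16_t *eob_ptr) {
  int i, eob = -1;
  for (i = 0; i < n_coeffs; ++i) {
    /* DC (position 0) uses entry 0 of round/quant/dequant, every AC position
     * uses entry 1. */
    const int k = (i == 0) ? 0 : 1;
    /* abs() via the "poor man's sign extract" used above:
     * s = x >> 15;  abs = (x ^ s) - s. */
    const int abs_coeff = coeff[i] < 0 ? -coeff[i] : coeff[i];
    int tmp = abs_coeff + round[k];
    if (tmp > 32767) tmp = 32767;           /* the vector code uses a saturating add */
    tmp = (tmp * quant[k]) >> 16;
    qcoeff[i] = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
    dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[k]);
    if (tmp && iscan[i] > eob) eob = iscan[i];  /* last nonzero, in scan order */
  }
  *eob_ptr = (uint16_t)(eob + 1);
}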
movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp mov r2, dequantmp - movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp mova m1, [roundq] ; m1 = round @@ -48,15 +46,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 ; m5 = dedicated zero - lea coeffq, [ coeffq+ncoeffq*2] - lea r5q, [ r5q+ncoeffq*2] - lea r3q, [ r3q+ncoeffq*2] - lea r4q, [r4q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq + lea r5q, [r5q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq + INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq neg ncoeffq ; get DC and first 15 AC coeffs - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpeqw m7, m7 @@ -69,8 +67,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m8 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 @@ -87,8 +85,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %else psrlw m0, m3, 1 %endif - mova [r4q+ncoeffq*2+ 0], m8 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -102,8 +100,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -123,8 +121,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m14 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 @@ -137,8 +135,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif - mova [r4q+ncoeffq*2+ 0], m14 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -154,10 +152,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jmp .accumulate_eob .skip_iter: - mova [r3q+ncoeffq*2+ 0], m5 - mova [r3q+ncoeffq*2+16], m5 - mova [r4q+ncoeffq*2+ 0], m5 - mova [r4q+ncoeffq*2+16], m5 + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 add ncoeffq, mmsize jl .ac_only_loop @@ -171,28 +169,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. 
just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - - lea r0q, [r0q+ncoeffq*2] - lea r2q, [r2q+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [r0q+ncoeffq*2+ 0], m7 - mova [r0q+ncoeffq*2+16], m7 - mova [r2q+ncoeffq*2+ 0], m7 - mova [r2q+ncoeffq*2+16], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [r3q], 0 + mov [r2], r6w RET %endmacro diff --git a/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm deleted file mode 100644 index 21aaa93831..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -; void vp9_temporal_filter_apply_sse2 | arg -; (unsigned char *frame1, | 0 -; unsigned int stride, | 1 -; unsigned char *frame2, | 2 -; unsigned int block_width, | 3 -; unsigned int block_height, | 4 -; int strength, | 5 -; int filter_weight, | 6 -; unsigned int *accumulator, | 7 -; unsigned short *count) | 8 -global sym(vp9_temporal_filter_apply_sse2) PRIVATE -sym(vp9_temporal_filter_apply_sse2): - - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ALIGN_STACK 16, rax - %define block_width 0 - %define block_height 16 - %define strength 32 - %define filter_weight 48 - %define rounding_bit 64 - %define rbp_backup 80 - %define stack_size 96 - sub rsp, stack_size - mov [rsp + rbp_backup], rbp - ; end prolog - - mov edx, arg(3) - mov [rsp + block_width], rdx - mov edx, arg(4) - mov [rsp + block_height], rdx - movd xmm6, arg(5) - movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read - - ; calculate the rounding bit outside the loop - ; 0x8000 >> (16 - strength) - mov rdx, 16 - sub rdx, arg(5) ; 16 - strength - movq xmm4, rdx ; can't use rdx w/ shift - movdqa xmm5, [GLOBAL(_const_top_bit)] - psrlw xmm5, xmm4 - movdqa [rsp + rounding_bit], xmm5 - - mov rsi, arg(0) ; src/frame1 - mov rdx, arg(2) ; predictor frame - mov rdi, arg(7) ; accumulator - mov rax, arg(8) ; count - - ; dup the filter weight and store for later - movd xmm0, arg(6) ; filter_weight - pshuflw xmm0, xmm0, 0 - punpcklwd xmm0, xmm0 - movdqa [rsp + filter_weight], xmm0 - - mov rbp, arg(1) ; stride - pxor xmm7, xmm7 ; zero for extraction - - mov rcx, [rsp + block_width] - imul rcx, [rsp + block_height] - add rcx, rdx - cmp dword ptr [rsp + block_width], 8 - jne .temporal_filter_apply_load_16 - -.temporal_filter_apply_load_8: - movq xmm0, [rsi] ; first row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm0, xmm7 ; src[ 0- 7] - movq xmm1, [rsi] ; second row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp .temporal_filter_apply_load_finished - -.temporal_filter_apply_load_16: - movdqa xmm0, [rsi] ; src (frame1) - lea rsi, [rsi + rbp] ; += stride - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; src[ 0- 7] - punpckhbw xmm1, xmm7 ; src[ 8-15] - -.temporal_filter_apply_load_finished: - movdqa xmm2, [rdx] ; predictor (frame2) - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm7 ; pred[ 0- 7] - punpckhbw 
xmm3, xmm7 ; pred[ 8-15] - - ; modifier = src_byte - pixel_value - psubw xmm0, xmm2 ; src - pred[ 0- 7] - psubw xmm1, xmm3 ; src - pred[ 8-15] - - ; modifier *= modifier - pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 - pmullw xmm1, xmm1 ; modifer[ 8-15]^2 - - ; modifier *= 3 - pmullw xmm0, [GLOBAL(_const_3w)] - pmullw xmm1, [GLOBAL(_const_3w)] - - ; modifer += 0x8000 >> (16 - strength) - paddw xmm0, [rsp + rounding_bit] - paddw xmm1, [rsp + rounding_bit] - - ; modifier >>= strength - psrlw xmm0, [rsp + strength] - psrlw xmm1, [rsp + strength] - - ; modifier = 16 - modifier - ; saturation takes care of modifier > 16 - movdqa xmm3, [GLOBAL(_const_16w)] - movdqa xmm2, [GLOBAL(_const_16w)] - psubusw xmm3, xmm1 - psubusw xmm2, xmm0 - - ; modifier *= filter_weight - pmullw xmm2, [rsp + filter_weight] - pmullw xmm3, [rsp + filter_weight] - - ; count - movdqa xmm4, [rax] - movdqa xmm5, [rax+16] - ; += modifier - paddw xmm4, xmm2 - paddw xmm5, xmm3 - ; write back - movdqa [rax], xmm4 - movdqa [rax+16], xmm5 - lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) - - ; load and extract the predictor up to shorts - pxor xmm7, xmm7 - movdqa xmm0, [rdx] - lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; pred[ 0- 7] - punpckhbw xmm1, xmm7 ; pred[ 8-15] - - ; modifier *= pixel_value - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - ; expand to double words - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm7 ; [ 0- 3] - punpckhwd xmm2, xmm7 ; [ 4- 7] - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 ; [ 8-11] - punpckhwd xmm3, xmm7 ; [12-15] - - ; accumulator - movdqa xmm4, [rdi] - movdqa xmm5, [rdi+16] - movdqa xmm6, [rdi+32] - movdqa xmm7, [rdi+48] - ; += modifier - paddd xmm4, xmm0 - paddd xmm5, xmm2 - paddd xmm6, xmm1 - paddd xmm7, xmm3 - ; write back - movdqa [rdi], xmm4 - movdqa [rdi+16], xmm5 - movdqa [rdi+32], xmm6 - movdqa [rdi+48], xmm7 - lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) - - cmp rdx, rcx - je .temporal_filter_apply_epilog - pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_width], 16 - je .temporal_filter_apply_load_16 - jmp .temporal_filter_apply_load_8 - -.temporal_filter_apply_epilog: - ; begin epilog - mov rbp, [rsp + rbp_backup] - add rsp, stack_size - pop rsp - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -_const_3w: - times 8 dw 3 -align 16 -_const_top_bit: - times 8 dw 1<<15 -align 16 -_const_16w - times 8 dw 16 diff --git a/libs/libvpx/vp9/vp9_cx_iface.c b/libs/libvpx/vp9/vp9_cx_iface.c index a797b2c262..881caae78b 100644 --- a/libs/libvpx/vp9/vp9_cx_iface.c +++ b/libs/libvpx/vp9/vp9_cx_iface.c @@ -51,6 +51,8 @@ struct vp9_extracfg { vpx_color_range_t color_range; int render_width; int render_height; + unsigned int row_mt; + unsigned int motion_vector_unit_test; }; static struct vp9_extracfg default_extra_cfg = { @@ -82,6 +84,8 @@ static struct vp9_extracfg default_extra_cfg = { 0, // color range 0, // render width 0, // render height + 0, // row_mt + 0, // motion_vector_unit_test }; struct vpx_codec_alg_priv { @@ -157,6 +161,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode); RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); @@ -166,12 +171,17 @@ static 
vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); RANGE_CHECK_BOOL(cfg, rc_resize_allowed); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); if (extra_cfg->max_gf_interval > 0) { @@ -182,6 +192,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, (MAX_LAG_BUFFERS - 1)); } + // For formation of valid ARF groups lag_in _frames should be 0 or greater + // than the max_gf_interval + 2 + if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 && + cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) { + ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)"); + } + if (cfg->rc_resize_allowed == 1) { RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w); RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h); @@ -197,7 +214,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 && level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 && level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN && - level != LEVEL_MAX) + level != LEVEL_AUTO && level != LEVEL_MAX) ERROR("target_level is invalid"); } @@ -244,6 +261,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "kf_min_dist not supported in auto mode, use 0 " "or kf_max_dist instead."); + RANGE_CHECK(extra_cfg, row_mt, 0, 1); + RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); RANGE_CHECK(extra_cfg, cpu_used, -8, 8); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); @@ -262,6 +281,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, if (extra_cfg->tuning == VP8_TUNE_SSIM) ERROR("Option --tune=ssim is not currently supported in VP9."); +#if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); @@ -313,6 +333,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("rc_twopass_stats_in missing EOS stats packet"); } } +#endif // !CONFIG_REALTIME_ONLY #if !CONFIG_VP9_HIGHBITDEPTH if (cfg->g_profile > (unsigned int)PROFILE_1) { @@ -389,6 +410,60 @@ static int get_image_bps(const vpx_image_t *img) { return 0; } +// Modify the encoder config for the target level. +static void config_target_level(VP9EncoderConfig *oxcf) { + double max_average_bitrate; // in bits per second + int max_over_shoot_pct; + const int target_level_index = get_level_index(oxcf->target_level); + + vpx_clear_system_state(); + assert(target_level_index >= 0); + assert(target_level_index < VP9_LEVELS); + + // Maximum target bit-rate is level_limit * 80%. 
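A worked example of the clamp that follows, under the assumption that the average_bitrate field of vp9_level_defs[] is in kilobits per second while oxcf->target_bandwidth is in bits per second, so the factor of 800 is 1000 (unit conversion) times 0.8 (the 80% cap); the numbers are illustrative only.

/* Level limit of 1800 kbps: 80% of 1.8 Mb/s is 1.44 Mb/s. */
double max_average_bitrate = 1800 * 800.0;          /* 1440000.0 b/s */
int64_t target_bandwidth = 2000000;                 /* caller asked for 2 Mb/s */
if ((double)target_bandwidth > max_average_bitrate)
  target_bandwidth = (int64_t)max_average_bitrate;  /* clamped to 1440000 */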
+ max_average_bitrate = + vp9_level_defs[target_level_index].average_bitrate * 800.0; + if ((double)oxcf->target_bandwidth > max_average_bitrate) + oxcf->target_bandwidth = (int64_t)(max_average_bitrate); + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; + + // Adjust max over-shoot percentage. + max_over_shoot_pct = + (int)((max_average_bitrate * 1.10 - (double)oxcf->target_bandwidth) * + 100 / (double)(oxcf->target_bandwidth)); + if (oxcf->over_shoot_pct > max_over_shoot_pct) + oxcf->over_shoot_pct = max_over_shoot_pct; + + // Adjust worst allowed quantizer. + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); + + // Adjust minimum art-ref distance. + // min_gf_interval should be no less than min_altref_distance + 1, + // as the encoder may produce bitstream with alt-ref distance being + // min_gf_interval - 1. + if (oxcf->min_gf_interval <= + (int)vp9_level_defs[target_level_index].min_altref_distance) { + oxcf->min_gf_interval = + (int)vp9_level_defs[target_level_index].min_altref_distance + 1; + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } + } + + // Adjust maximum column tiles. + if (vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) { + while (oxcf->tile_columns > 0 && + vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) + --oxcf->tile_columns; + } +} + static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { @@ -452,6 +527,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity; oxcf->auto_key = cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; @@ -509,6 +585,9 @@ static vpx_codec_err_t set_encoder_config( oxcf->target_level = extra_cfg->target_level; + oxcf->row_mt = extra_cfg->row_mt; + oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { #if CONFIG_SPATIAL_SVC oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; @@ -532,6 +611,8 @@ static vpx_codec_err_t set_encoder_config( } else if (oxcf->ts_number_layers == 1) { oxcf->ts_rate_decimator[0] = 1; } + + if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -557,6 +638,7 @@ static vpx_codec_err_t set_encoder_config( printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); + printf("vbr_corpus_complexity: %d\n", oxcf->vbr_corpus_complexity); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf); printf("Version: %d\n", oxcf->Version); @@ -795,6 +877,21 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.row_mt = CAST(VP9E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.motion_vector_unit_test = + CAST(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return VPX_CODEC_INVALID_PARAM; @@ -817,12 +914,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) { - return VPX_CODEC_MEM_ERROR; - } -#endif - if (ctx->config.enc) { // Update the reference to the config structure to an internal copy. priv->cfg = *ctx->config.enc; @@ -854,9 +945,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); vp9_remove_compressor(ctx->cpi); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -867,6 +955,10 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { MODE new_mode = BEST; +#if CONFIG_REALTIME_ONLY + (void)duration; + deadline = VPX_DL_REALTIME; +#else switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { @@ -887,6 +979,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case VPX_RC_FIRST_PASS: break; case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? 
GOOD : BEST; break; } +#endif // CONFIG_REALTIME_ONLY if (deadline == VPX_DL_REALTIME) { ctx->oxcf.pass = 0; @@ -1002,6 +1095,28 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && + !cpi->level_constraint.rc_config_updated) { + SVC *const svc = &cpi->svc; + const int is_two_pass_svc = + (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS *stats = &twopass->total_stats; + if (is_two_pass_svc) { + const double frame_rate = 10000000.0 * stats->count / stats->duration; + vp9_update_spatial_layer_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * + svc->layer_context[svc->spatial_layer_id].target_bandwidth / + 10000000.0); + } else { + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + } + cpi->level_constraint.rc_config_updated = 1; + } + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { @@ -1039,7 +1154,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } cpi->common.error.setjmp = 1; - vp9_apply_encoding_flags(cpi, flags); + if (res == VPX_CODEC_OK) vp9_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals if (ctx->cfg.kf_mode == VPX_KF_AUTO && @@ -1173,8 +1288,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; int sl; @@ -1194,7 +1308,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); } -#endif #endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { @@ -1367,6 +1480,9 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) { return VPX_CODEC_INVALID_PARAM; } + + vp9_set_row_mt(ctx->cpi); + return VPX_CODEC_OK; } @@ -1525,6 +1641,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, + { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -1581,6 +1699,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 50, // rc_two_pass_vbrbias 0, // rc_two_pass_vbrmin_section 2000, // rc_two_pass_vbrmax_section + 0, // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr) // keyframing settings (kf) VPX_KF_AUTO, // g_kfmode diff --git a/libs/libvpx/vp9/vp9_dx_iface.c b/libs/libvpx/vp9/vp9_dx_iface.c index 3b5dc3ddac..657490f4bd 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.c +++ b/libs/libvpx/vp9/vp9_dx_iface.c @@ -47,12 +47,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->init_flags = ctx->init_flags; priv->si.sz = sizeof(priv->si); priv->flushed = 0; - // Only do frame parallel decode when threads > 1. 
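With the frame-parallel worker pool removed here, the only decode-side threading knob left is the thread count handed to the single decoder instance (pbi->max_threads below). A hypothetical caller-side sketch, not part of this patch:

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static vpx_codec_err_t open_vp9_decoder_sketch(vpx_codec_ctx_t *decoder) {
  /* Request 4 tile/loopfilter worker threads through the ordinary decoder
   * configuration; VPX_CODEC_USE_FRAME_THREADING no longer selects a
   * separate frame-parallel path. */
  vpx_codec_dec_cfg_t dec_cfg = { 4, 0, 0 }; /* threads, w, h */
  return vpx_codec_dec_init(decoder, vpx_codec_vp9_dx(), &dec_cfg, 0);
}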
- priv->frame_parallel_decode = - (ctx->config.dec && (ctx->config.dec->threads > 1) && - (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) - ? 1 - : 0; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; @@ -63,33 +57,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, } static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { - if (ctx->frame_workers != NULL) { - int i; - // Shutdown all threads before reclaiming any memory. The frame-level - // parallel decoder may access data from another worker. - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - vpx_get_worker_interface()->end(worker); - } - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - vp9_remove_common(&frame_worker_data->pbi->common); -#if CONFIG_VP9_POSTPROC - vp9_free_postproc_buffers(&frame_worker_data->pbi->common); -#endif - vp9_decoder_remove(frame_worker_data->pbi); - vpx_free(frame_worker_data->scratch_buffer); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&frame_worker_data->stats_mutex); - pthread_cond_destroy(&frame_worker_data->stats_cond); -#endif - vpx_free(frame_worker_data); - } -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif + if (ctx->pbi != NULL) { + vp9_decoder_remove(ctx->pbi); } if (ctx->buffer_pool) { @@ -97,7 +66,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); } - vpx_free(ctx->frame_workers); vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -231,32 +199,26 @@ static vpx_codec_err_t update_error_state( } static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { - int i; + VP9_COMMON *const cm = &ctx->pbi->common; + BufferPool *const pool = cm->buffer_pool; - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - BufferPool *const pool = cm->buffer_pool; + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; - cm->new_fb_idx = INVALID_IDX; - cm->byte_alignment = ctx->byte_alignment; - cm->skip_loop_filter = ctx->skip_loop_filter; + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; - if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { - pool->get_fb_cb = ctx->get_ext_fb_cb; - pool->release_fb_cb = ctx->release_ext_fb_cb; - pool->cb_priv = ctx->ext_priv; - } else { - pool->get_fb_cb = vp9_get_frame_buffer; - pool->release_fb_cb = vp9_release_frame_buffer; + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to initialize internal frame buffers"); - - pool->cb_priv = &pool->int_frame_buffers; - } + pool->cb_priv = &pool->int_frame_buffers; } } @@ -273,124 +235,21 @@ static 
void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } -static int frame_worker_hook(void *arg1, void *arg2) { - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; - const uint8_t *data = frame_worker_data->data; - (void)arg2; - - frame_worker_data->result = vp9_receive_compressed_data( - frame_worker_data->pbi, frame_worker_data->data_size, &data); - frame_worker_data->data_end = data; - - if (frame_worker_data->pbi->frame_parallel_decode) { - // In frame parallel decoding, a worker thread must successfully decode all - // the compressed data. - if (frame_worker_data->result != 0 || - frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; - BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; - // Signal all the other threads that are waiting for this frame. - vp9_frameworker_lock_stats(worker); - frame_worker_data->frame_context_ready = 1; - lock_buffer_pool(pool); - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - unlock_buffer_pool(pool); - frame_worker_data->pbi->need_resync = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - return 0; - } - } else if (frame_worker_data->result != 0) { - // Check decode result in serial decode. - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - frame_worker_data->pbi->need_resync = 1; - } - return !frame_worker_data->result; -} - static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { - int i; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - ctx->last_show_frame = -1; - ctx->next_submit_worker_id = 0; - ctx->last_submit_worker_id = 0; - ctx->next_output_worker_id = 0; - ctx->frame_cache_read = 0; - ctx->frame_cache_write = 0; - ctx->num_cache_frames = 0; ctx->need_resync = 1; - ctx->num_frame_workers = - (ctx->frame_parallel_decode == 1) ? 
ctx->cfg.threads : 1; - if (ctx->num_frame_workers > MAX_DECODE_THREADS) - ctx->num_frame_workers = MAX_DECODE_THREADS; - ctx->available_threads = ctx->num_frame_workers; ctx->flushed = 0; ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + ctx->pbi = vp9_decoder_create(ctx->buffer_pool); + if (ctx->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } -#endif - - ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers * - sizeof(*ctx->frame_workers)); - if (ctx->frame_workers == NULL) { - set_error_detail(ctx, "Failed to allocate frame_workers"); - return VPX_CODEC_MEM_ERROR; - } - - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *frame_worker_data = NULL; - winterface->init(worker); - worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); - if (worker->data1 == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); - if (frame_worker_data->pbi == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->pbi->frame_worker_owner = worker; - frame_worker_data->worker_id = i; - frame_worker_data->scratch_buffer = NULL; - frame_worker_data->scratch_buffer_size = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 0; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data mutex"); - return VPX_CODEC_MEM_ERROR; - } - - if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); - return VPX_CODEC_MEM_ERROR; - } -#endif - // If decoding in serial mode, FrameWorker thread could create tile worker - // thread or loopfilter thread. - frame_worker_data->pbi->max_threads = - (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; - - frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; - frame_worker_data->pbi->common.frame_parallel_decode = - ctx->frame_parallel_decode; - worker->hook = (VPxWorkerHook)frame_worker_hook; - if (!winterface->reset(worker)) { - set_error_detail(ctx, "Frame Worker thread creation failed"); - return VPX_CODEC_MEM_ERROR; - } - } + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; // If postprocessing was enabled by the application and a // configuration has not been provided, default it. @@ -404,7 +263,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, const VP9Decoder *const pbi) { - // Clear resync flag if worker got a key frame or intra only frame. + // Clear resync flag if the decoder got a key frame or intra only frame. 
if (ctx->need_resync == 1 && pbi->need_resync == 0 && (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) ctx->need_resync = 0; @@ -413,7 +272,6 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); (void)deadline; // Determine the stream parameters. Note that we rely on peek_si to @@ -429,103 +287,25 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR; } - if (!ctx->frame_parallel_decode) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->data = *data; - frame_worker_data->data_size = data_sz; - frame_worker_data->user_priv = user_priv; - frame_worker_data->received_frame = 1; + ctx->user_priv = user_priv; - // Set these even if already initialized. The caller may have changed the - // decrypt config between frames. - frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; - worker->had_error = 0; - winterface->execute(worker); - - // Update data pointer after decode. - *data = frame_worker_data->data_end; - - if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); - - check_resync(ctx, frame_worker_data->pbi); - } else { - VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - // Copy context from last worker thread to next worker thread. - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - vp9_frameworker_copy_context( - &ctx->frame_workers[ctx->next_submit_worker_id], - &ctx->frame_workers[ctx->last_submit_worker_id]); - - frame_worker_data->pbi->ready_for_new_data = 0; - // Copy the compressed data into worker's internal buffer. - // TODO(hkuang): Will all the workers allocate the same size - // as the size of the first intra frame be better? This will - // avoid too many deallocate and allocate. 
- if (frame_worker_data->scratch_buffer_size < data_sz) { - frame_worker_data->scratch_buffer = - (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz); - if (frame_worker_data->scratch_buffer == NULL) { - set_error_detail(ctx, "Failed to reallocate scratch buffer"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->scratch_buffer_size = data_sz; - } - frame_worker_data->data_size = data_sz; - memcpy(frame_worker_data->scratch_buffer, *data, data_sz); - - frame_worker_data->frame_decoded = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 1; - frame_worker_data->data = frame_worker_data->scratch_buffer; - frame_worker_data->user_priv = user_priv; - - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - ctx->last_submit_worker_id = - (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; - - ctx->next_submit_worker_id = - (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; - --ctx->available_threads; - worker->had_error = 0; - winterface->launch(worker); + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) { + ctx->pbi->cur_buf->buf.corrupted = 1; + ctx->pbi->need_resync = 1; + ctx->need_resync = 1; + return update_error_state(ctx, &ctx->pbi->common.error); } + check_resync(ctx, ctx->pbi); + return VPX_CODEC_OK; } -static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - // TODO(hkuang): Add worker error handling here. - winterface->sync(worker); - frame_worker_data->received_frame = 0; - ++ctx->available_threads; - - check_resync(ctx, frame_worker_data->pbi); - - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; - yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, - frame_worker_data->user_priv); - ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = - frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; - ++ctx->num_cache_frames; - } -} - static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { @@ -543,8 +323,8 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Reset flushed when receiving a valid frame. ctx->flushed = 0; - // Initialize the decoder workers on the first frame. - if (ctx->frame_workers == NULL) { + // Initialize the decoder on the first frame. + if (ctx->pbi == NULL) { const vpx_codec_err_t res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -553,91 +333,40 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, ctx->decrypt_cb, ctx->decrypt_state); if (res != VPX_CODEC_OK) return res; - if (ctx->frame_parallel_decode) { - // Decode in frame parallel mode. When decoding in this mode, the frame - // passed to the decoder must be either a normal frame or a superframe with - // superframe index so the decoder could get each frame's start position - // in the superframe. 
- if (frame_count > 0) { - int i; + if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) + frame_count = ctx->svc_spatial_layer + 1; - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } + // Decode in serial mode. + if (frame_count > 0) { + int i; - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { + set_error_detail(ctx, "Invalid frame size in index"); + return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data, data_sz, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; + + data_start += frame_size; } } else { - // Decode in serial mode. - if (frame_count > 0) { - int i; + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t)(data_end - data_start); + const vpx_codec_err_t res = + decode_one(ctx, &data_start, frame_size, user_priv, deadline); + if (res != VPX_CODEC_OK) return res; - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - data_start += frame_size; - } - } else { + // Account for suboptimal termination by the encoder. while (data_start < data_end) { - const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - // Account for suboptimal termination by the encoder. 
- while (data_start < data_end) { - const uint8_t marker = - read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); - if (marker) break; - ++data_start; - } + const uint8_t marker = + read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); + if (marker) break; + ++data_start; } } } @@ -645,80 +374,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return res; } -static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) { - RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; - // Decrease reference count of last output frame in frame parallel mode. - if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - BufferPool *const pool = ctx->buffer_pool; - lock_buffer_pool(pool); - decrease_ref_count(ctx->last_show_frame, frame_bufs, pool); - unlock_buffer_pool(pool); - } -} - static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - // Only return frame when all the cpu are busy or - // application fluhsed the decoder in frame parallel decode. - if (ctx->frame_parallel_decode && ctx->available_threads > 0 && - !ctx->flushed) { - return NULL; - } + // Legacy parameter carried over from VP8. Has no effect for VP9 since we + // always return only 1 frame per decode call. + (void)iter; - // Output the frames in the cache first. - if (ctx->num_cache_frames > 0) { - release_last_output_frame(ctx); - ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; - if (ctx->need_resync) return NULL; - img = &ctx->frame_cache[ctx->frame_cache_read].img; - ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; - --ctx->num_cache_frames; - return img; - } - - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (*iter == NULL && ctx->frame_workers != NULL) { - do { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) - set_ppflags(ctx, &flags); - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. - if (frame_worker_data->received_frame == 1) { - ++ctx->available_threads; - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); - } - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - release_last_output_frame(ctx); - ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; - if (ctx->need_resync) return NULL; - yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); - ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - img = &ctx->img; - return img; - } - } else { - // Decoding failed. Release the worker thread. 
- frame_worker_data->received_frame = 0; - ++ctx->available_threads; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; - } - } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); + if (ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = { 0, 0, 0 }; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &ctx->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->last_show_frame = ctx->pbi->common.new_fb_idx; + if (ctx->need_resync) return NULL; + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } } return NULL; } @@ -728,7 +405,7 @@ static vpx_codec_err_t decoder_set_fb_fn( vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return VPX_CODEC_INVALID_PARAM; - } else if (ctx->frame_workers == NULL) { + } else if (ctx->pbi == NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. ctx->get_ext_fb_cb = cb_get; @@ -744,21 +421,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_set_reference_dec(&frame_worker_data->pbi->common, - ref_frame_to_vp9_reframe(frame->frame_type), - &sd); + return vp9_set_reference_dec( + &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -768,20 +436,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_copy_reference_dec(frame_worker_data->pbi, - (VP9_REFFRAME)frame->frame_type, &sd); + return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type, + &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -791,17 +451,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); - // Only support this function in serial decode. 
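With the frame-parallel worker pool removed, decoder_get_frame() above hands back at most one image per successful decode call, so the usual caller-side pull loop still applies. A minimal sketch of that contract (editor's illustration, not taken from this patch; assumes an initialized VP9 decoder context named decoder and one compressed frame in buf of length len):

  vpx_codec_iter_t iter = NULL;
  vpx_image_t *img;
  if (vpx_codec_decode(&decoder, buf, len, NULL, 0) != VPX_CODEC_OK) {
    /* handle the decode error */
  }
  while ((img = vpx_codec_get_frame(&decoder, &iter)) != NULL) {
    /* consume img; for VP9 this loop yields at most one image per decode call */
  }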
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { YV12_BUFFER_CONFIG *fb; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + fb = get_ref_frame(&ctx->pbi->common, data->idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -829,22 +481,21 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->pbi->common.base_qindex; + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (update_info) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - *update_info = frame_worker_data->pbi->refresh_frame_flags; + if (ctx->pbi != NULL) { + *update_info = ctx->pbi->refresh_frame_flags; return VPX_CODEC_OK; } else { return VPX_CODEC_ERROR; @@ -859,14 +510,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, int *corrupted = va_arg(args, int *); if (corrupted) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - RefCntBuffer *const frame_bufs = - frame_worker_data->pbi->common.buffer_pool->frame_bufs; - if (frame_worker_data->pbi->common.frame_to_show == NULL) - return VPX_CODEC_ERROR; + if (ctx->pbi != NULL) { + RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs; + if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; if (ctx->last_show_frame >= 0) *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; return VPX_CODEC_OK; @@ -882,18 +528,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (frame_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; frame_size[0] = cm->width; frame_size[1] = cm->height; return VPX_CODEC_OK; @@ -909,18 +546,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const render_size = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (render_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; render_size[0] = cm->render_width; render_size[1] = cm->render_height; return VPX_CODEC_OK; @@ -935,13 +563,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; if (bit_depth) { - if (worker) { - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; *bit_depth = cm->bit_depth; return VPX_CODEC_OK; } else { @@ -980,10 +605,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; ctx->byte_alignment = byte_alignment; - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + ctx->pbi->common.byte_alignment = byte_alignment; } return VPX_CODEC_OK; } @@ -992,15 +615,23 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, va_list args) { ctx->skip_loop_filter = va_arg(args, int); - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + if (ctx->pbi != NULL) { + ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter; } return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->svc_decoding = 1; + ctx->svc_spatial_layer = va_arg(args, int); + if (ctx->svc_spatial_layer < 0) + return VPX_CODEC_INVALID_PARAM; + else + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1011,8 +642,10 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VPXD_SET_DECRYPTOR, ctrl_set_decryptor }, { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, // Getters + { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, { VP9_GET_REFERENCE, ctrl_get_reference }, @@ -1029,7 +662,10 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | +#if CONFIG_VP9_HIGHBITDEPTH + VPX_CODEC_CAP_HIGHBITDEPTH | +#endif + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // vpx_codec_caps_t decoder_init, // vpx_codec_init_fn_t decoder_destroy, // vpx_codec_destroy_fn_t diff --git 
a/libs/libvpx/vp9/vp9_dx_iface.h b/libs/libvpx/vp9/vp9_dx_iface.h index cc3d51842a..18bc7ab0d6 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.h +++ b/libs/libvpx/vp9/vp9_dx_iface.h @@ -15,19 +15,12 @@ typedef vpx_codec_stream_info_t vp9_stream_info_t; -// This limit is due to framebuffer numbers. -// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. -#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. - -typedef struct cache_frame { - int fb_idx; - vpx_image_t img; -} cache_frame; - struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; + VP9Decoder *pbi; + void *user_priv; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -40,26 +33,18 @@ struct vpx_codec_alg_priv { int byte_alignment; int skip_loop_filter; - // Frame parallel related. - int frame_parallel_decode; // frame-based threading. - VPxWorker *frame_workers; - int num_frame_workers; - int next_submit_worker_id; - int last_submit_worker_id; - int next_output_worker_id; - int available_threads; - cache_frame frame_cache[FRAME_CACHE_SIZE]; - int frame_cache_write; - int frame_cache_read; - int num_cache_frames; int need_resync; // wait for key/intra-only frame - // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + // BufferPool that holds all reference frames. BufferPool *buffer_pool; // External frame buffer info to save for VP9 common. void *ext_priv; // Private data associated with the external frame buffers. vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; + + // Allow for decoding up to a given spatial layer for SVC stream. + int svc_decoding; + int svc_spatial_layer; }; #endif // VP9_VP9_DX_IFACE_H_ diff --git a/libs/libvpx/vp9/vp9cx.mk b/libs/libvpx/vp9/vp9cx.mk index a8ca0d5935..d633ed1429 100644 --- a/libs/libvpx/vp9/vp9cx.mk +++ b/libs/libvpx/vp9/vp9cx.mk @@ -39,9 +39,13 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c +VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h VP9_CX_SRCS-yes += encoder/vp9_encoder.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h @@ -96,7 +100,8 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c + VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -104,12 +109,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm -else VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -endif ifeq ($(ARCH_X86_64),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -117,20 +117,20 @@ endif 
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c -endif ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c endif -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -138,6 +138,11 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c + +# Strip unnecessary files with CONFIG_REALTIME_ONLY +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libs/libvpx/vp9/vp9dx.mk b/libs/libvpx/vp9/vp9dx.mk index 4c6fd00715..59f612b94c 100644 --- a/libs/libvpx/vp9/vp9dx.mk +++ b/libs/libvpx/vp9/vp9dx.mk @@ -24,8 +24,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.h -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c diff --git a/libs/libvpx/vpx/src/svc_encodeframe.c b/libs/libvpx/vpx/src/svc_encodeframe.c index 5aa0b8ddb8..f633600c79 100644 --- a/libs/libvpx/vpx/src/svc_encodeframe.c +++ b/libs/libvpx/vpx/src/svc_encodeframe.c @@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11, static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16, 16, 16 }; +static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 }; + +static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 }; + typedef enum { QUANTIZER = 0, BITRATE, @@ -127,9 +131,9 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, int *value0, int *value1) { if (type == SCALE_FACTOR) { - *value0 = strtol(input, &input, 10); + *value0 = (int)strtol(input, &input, 10); if (*input++ != '/') return VPX_CODEC_INVALID_PARAM; - *value1 = strtol(input, &input, 10); + *value1 = (int)strtol(input, &input, 10); if (*value0 < option_min_values[SCALE_FACTOR] || *value1 < option_min_values[SCALE_FACTOR] || @@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, char *token; const char 
*delim = ","; char *save_ptr; + int num_layers = svc_ctx->spatial_layers; + if (type == BITRATE) + num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers; if (input == NULL || option0 == NULL || (option1 == NULL && type == SCALE_FACTOR)) @@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, input_string = strdup(input); token = strtok_r(input_string, delim, &save_ptr); - for (i = 0; i < svc_ctx->spatial_layers; ++i) { + for (i = 0; i < num_layers; ++i) { if (token != NULL) { res = extract_option(type, token, option0 + i, option1 + i); if (res != VPX_CODEC_OK) break; @@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, break; } } - if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) { + if (res == VPX_CODEC_OK && i != num_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: layer params type: %d %d values required, " "but only %d specified\n", - type, svc_ctx->spatial_layers, i); + type, num_layers, i); res = VPX_CODEC_INVALID_PARAM; } free(input_string); @@ -194,7 +201,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { char *input_string; char *option_name; char *option_value; - char *input_ptr; + char *input_ptr = NULL; SvcInternal_t *const si = get_svc_internal(svc_ctx); vpx_codec_err_t res = VPX_CODEC_OK; int i, alt_ref_enabled = 0; @@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -void assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, + vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; if (svc_ctx->temporal_layering_mode != 0) { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0; + total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers + + svc_ctx->temporal_layers - 1]; for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] = si->bitrates[sl * svc_ctx->temporal_layers + tl]; + if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <= + si->bitrates[sl * svc_ctx->temporal_layers + tl - 1])) + return VPX_CODEC_INVALID_PARAM; } } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } else { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (i = 0; i < svc_ctx->spatial_layers; ++i) { enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; - enc_cfg->rc_target_bitrate += si->bitrates[i]; + enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i]; + total_bitrate += si->bitrates[i]; } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } } + return VPX_CODEC_OK; } vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, 
vpx_codec_ctx_t *codec_ctx, @@ -412,12 +429,28 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; si->svc_params.speed_per_layer[sl] = svc_ctx->speed; } - + if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS && + svc_ctx->spatial_layers <= 3) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl; + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; + } + if (svc_ctx->spatial_layers == 1) { + si->svc_params.scaling_factor_num[0] = 1; + si->svc_params.scaling_factor_den[0] = 1; + } + } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = 56; + si->svc_params.min_quantizers[i] = 2; + } } } @@ -442,7 +475,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, (int)VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } - assign_layer_bitrates(svc_ctx, enc_cfg); + res = assign_layer_bitrates(svc_ctx, enc_cfg); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "layer bitrates incorrect: \n" + "1) spatial layer bitrates should sum up to target \n" + "2) temporal layer bitrates should be increasing within \n" + "a spatial layer \n"); + return VPX_CODEC_INVALID_PARAM; + } #if CONFIG_SPATIAL_SVC for (i = 0; i < svc_ctx->spatial_layers; ++i) @@ -518,8 +559,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { int i; for (i = 0; i < svc_ctx->spatial_layers; ++i) { @@ -553,10 +593,9 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; break; } -#endif #endif case VPX_CODEC_PSNR_PKT: { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) int j; svc_log(svc_ctx, SVC_LOG_DEBUG, "frame: %d, layer: %d, PSNR(Total/Y/U/V): " diff --git a/libs/libvpx/vpx/src/vpx_encoder.c b/libs/libvpx/vpx/src/vpx_encoder.c index 4390cf7c8f..1cf2dca695 100644 --- a/libs/libvpx/vpx/src/vpx_encoder.c +++ b/libs/libvpx/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder algorithms. 
* */ +#include #include +#include #include +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( int i; void *mem_loc = NULL; + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { for (i = 0; i < num_enc; i++) { vpx_codec_priv_enc_mr_cfg_t mr_cfg; @@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, encoder at higher + * resolution always use the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/libs/libvpx/vpx/svc_context.h b/libs/libvpx/vpx/svc_context.h index c8bde5832a..462785075c 100644 --- a/libs/libvpx/vpx/svc_context.h +++ b/libs/libvpx/vpx/svc_context.h @@ -54,7 +54,7 @@ typedef struct SvcInternal { // values extracted from option, quantizers vpx_svc_extra_cfg_t svc_params; int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; - int bitrates[VPX_SS_MAX_LAYERS]; + int bitrates[VPX_MAX_LAYERS]; // accumulated statistics double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V diff --git a/libs/libvpx/vpx/vp8cx.h b/libs/libvpx/vpx/vp8cx.h index 8fa25e8bc0..c21b8b60db 100644 --- a/libs/libvpx/vpx/vp8cx.h +++ b/libs/libvpx/vpx/vp8cx.h @@ -333,11 +333,12 @@ enum vp8e_enc_control_id { * 2 = 4 tile columns * ..... * n = 2**n tile columns - * The requested tile columns will be capped by encoder based on image size - * limitation (The minimum width of a tile column is 256 pixel, the maximum - * is 4096). + * The requested tile columns will be capped by the encoder based on image + * size limitations (The minimum width of a tile column is 256 pixels, the + * maximum is 4096). * - * By default, the value is 0, i.e. one single column tile for entire image. + * By default, the value is 6, i.e., the maximum number of tiles supported by + * the resolution. 
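 * (Editor's illustration, not part of this change: the control takes the log2
 * of the requested column count, so with an initialized encoder context named
 * codec,
 *   vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, 2);
 * requests 2**2 = 4 tile columns, subject to the cap described above.)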
* * Supported in codecs: VP9 */ @@ -368,10 +369,10 @@ enum vp8e_enc_control_id { * VP9 has a bitstream feature to reduce decoding dependency between frames * by turning off backward update of probability context used in encoding * and decoding. This allows staged parallel processing of more than one - * video frames in the decoder. This control function provides a mean to + * video frame in the decoder. This control function provides a means to * turn this feature on or off for bitstreams produced by encoder. * - * By default, this feature is off. + * By default, this feature is on. * * Supported in codecs: VP9 */ @@ -407,7 +408,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set noise sensitivity. * - * 0: off, 1: On(YOnly) + * 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) * * Supported in codecs: VP9 */ @@ -443,6 +444,7 @@ enum vp8e_enc_control_id { * \note Valid parameter range: * VP9E_CONTENT_DEFAULT = Regular video content (Default) * VP9E_CONTENT_SCREEN = Screen capture content + * VP9E_CONTENT_FILM = Film content: improves grain retention * * Supported in codecs: VP9 */ @@ -547,6 +549,14 @@ enum vp8e_enc_control_id { */ VP9E_SET_TARGET_LEVEL, + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROW_MT, + /*!\brief Codec control function to get bitstream level. * * Supported in codecs: VP9 @@ -561,7 +571,31 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_ALT_REF_AQ + VP9E_SET_ALT_REF_AQ, + + /*!\brief Boost percentage for Golden Frame in CBR mode. + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP8 + */ + VP8E_SET_GF_CBR_BOOST_PCT, + + /*!\brief Codec control function to enable the extreme motion vector unit test + * in VP9. Please note that this is only used in motion vector unit test. + * + * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV + * + * Supported in codecs: VP9 + */ + VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, }; /*!\brief vpx 1-D scaling mode @@ -663,6 +697,7 @@ typedef enum { typedef enum { VP9E_CONTENT_DEFAULT, VP9E_CONTENT_SCREEN, + VP9E_CONTENT_FILM, VP9E_CONTENT_INVALID } vp9e_tune_content; @@ -769,6 +804,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT + VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) #define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE @@ -820,9 +858,15 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) +#define VPX_CTRL_VP9E_SET_ROW_MT + VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) +#define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST + /*!\endcond */ /*! 
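 * Example use of the new encoder controls above (an editor's sketch, assuming
 * an already-initialized VP9 encoder context named codec):
 *   vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
 *   vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_FILM);
 * ROW_MT enables row-based multi-threading within a tile, and CONTENT_FILM is
 * the new tuning that favours grain retention; the VP8-only
 * VP8E_SET_GF_CBR_BOOST_PCT control is applied the same way to a VP8 context.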
@} - end defgroup vp8_encoder */ #ifdef __cplusplus diff --git a/libs/libvpx/vpx/vp8dx.h b/libs/libvpx/vpx/vp8dx.h index 88204acd37..398c670220 100644 --- a/libs/libvpx/vpx/vp8dx.h +++ b/libs/libvpx/vpx/vp8dx.h @@ -111,6 +111,19 @@ enum vp8_dec_control_id { */ VP9_SET_SKIP_LOOP_FILTER, + /** control function to decode SVC stream up to the x spatial layers, + * where x is passed in through the control, and is 0 for base layer. + */ + VP9_DECODE_SVC_SPATIAL_LAYER, + + /*!\brief Codec control function to get last decoded frame quantizer. + * + * Return value uses internal quantizer scale defined by the codec. + * + * Supported in codecs: VP8, VP9 + */ + VPXD_GET_LAST_QUANTIZER, + VP8_DECODER_CTRL_ID_MAX }; @@ -150,6 +163,8 @@ VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_USED +VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) @@ -162,6 +177,10 @@ VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) #define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/libs/libvpx/vpx/vpx_codec.h b/libs/libvpx/vpx/vpx_codec.h index fe75d23872..ad05f4c74e 100644 --- a/libs/libvpx/vpx/vpx_codec.h +++ b/libs/libvpx/vpx/vpx_codec.h @@ -42,38 +42,39 @@ extern "C" { #endif -#include "./vpx_integer.h" #include "./vpx_image.h" +#include "./vpx_integer.h" /*!\brief Decorator indicating a function is deprecated */ -#ifndef DEPRECATED +#ifndef VPX_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__((deprecated)) +#define VPX_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) -#define DEPRECATED +#define VPX_DEPRECATED #else -#define DEPRECATED +#define VPX_DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* VPX_DEPRECATED */ -#ifndef DECLSPEC_DEPRECATED +#ifndef VPX_DECLSPEC_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #elif defined(_MSC_VER) -/*!\brief \copydoc #DEPRECATED */ -#define DECLSPEC_DEPRECATED __declspec(deprecated) +/*!\brief \copydoc #VPX_DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated) #else -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* VPX_DECLSPEC_DEPRECATED */ /*!\brief Decorator indicating a function is potentially unused */ -#ifdef UNUSED -#elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__((unused)) +#ifndef VPX_UNUSED +#if defined(__GNUC__) || defined(__clang__) +#define VPX_UNUSED __attribute__((unused)) #else -#define UNUSED +#define VPX_UNUSED #endif +#endif /* VPX_UNUSED */ /*!\brief Current ABI version number * @@ -83,7 +84,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ 
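The decoder-side controls introduced in the vp8dx.h hunk above are invoked like any other codec control. A hedged sketch (editor's illustration; assumes an initialized VP9 decoder context named decoder, and the getter is only meaningful after at least one frame has been decoded):

  int last_qindex = 0;
  /* Decode only the base layer plus one enhancement layer of an SVC stream. */
  vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, 1);
  /* Internal quantizer index of the most recently decoded frame. */
  vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &last_qindex);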
+#define VPX_CODEC_ABI_VERSION (4 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ /*!\brief Algorithm return codes */ typedef enum { @@ -152,6 +153,10 @@ typedef long vpx_codec_caps_t; #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ +/*! Can support images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x4 + /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow for @@ -409,7 +414,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_USE_TYPE(id, typ) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id, typ data) { \ @@ -426,13 +431,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); * It defines a static function with the correctly typed arguments as a * wrapper to the type-unsafe internal function. */ -#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ - return vpx_codec_control_(ctx, ctrl_id, data); \ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \ + \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ /*!\brief vpx_codec_control void type definition macro @@ -447,7 +452,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_VOID(id) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id) { \ diff --git a/libs/libvpx/vpx/vpx_encoder.h b/libs/libvpx/vpx/vpx_encoder.h index 28fcd5f999..464bc408c8 100644 --- a/libs/libvpx/vpx/vpx_encoder.h +++ b/libs/libvpx/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -83,10 +83,6 @@ extern "C" { */ #define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 -/*! Can support input images at greater than 8 bitdepth. - */ -#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 - /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow @@ -158,9 +154,8 @@ enum vpx_codec_cx_pkt_kind { VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed before the next ABI -// bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. 
+#if defined(VPX_TEST_SPATIAL_SVC) VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ #endif @@ -196,9 +191,8 @@ typedef struct vpx_codec_cx_pkt { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed before the next -// ABI bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. +#if defined(VPX_TEST_SPATIAL_SVC) size_t layer_sizes[VPX_SS_MAX_LAYERS]; struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; #endif @@ -512,25 +506,31 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Rate control adaptation undershoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be subtracted from the target bitrate in order to compensate * for prior overshoot. - * - * Valid values in the range 0-1000. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * undershoot level (current rate vs target) beyond which more agressive + * corrective measures are taken. + * * + * Valid values in the range VP8:0-1000 VP9: 0-100. */ unsigned int rc_undershoot_pct; /*!\brief Rate control adaptation overshoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be added to the target bitrate in order to compensate for * prior undershoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * overshoot level (current rate vs target) beyond which more agressive + * corrective measures are taken. * - * Valid values in the range 0-1000. + * Valid values in the range VP8:0-1000 VP9: 0-100. */ unsigned int rc_overshoot_pct; @@ -595,6 +595,13 @@ typedef struct vpx_codec_enc_cfg { */ unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Two-pass corpus vbr mode complexity control + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. 
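 * (Editor's illustration with a hypothetical value: for a two-pass VBR encode,
 * setting
 *   cfg.rc_2pass_vbr_corpus_complexity = 650;
 * before vpx_codec_enc_init() switches rate control to corpus VBR, since any
 * non-zero value enables the mode.)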
+ */ + unsigned int rc_2pass_vbr_corpus_complexity; + /* * keyframing settings (kf) */ diff --git a/libs/libvpx/vpx_dsp/add_noise.c b/libs/libvpx/vpx_dsp/add_noise.c index a2b4c9010f..cda6ae8814 100644 --- a/libs/libvpx/vpx_dsp/add_noise.c +++ b/libs/libvpx/vpx_dsp/add_noise.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" #include "vpx_ports/mem.h" void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, diff --git a/libs/libvpx/vpx_dsp/arm/avg_neon.c b/libs/libvpx/vpx_dsp/arm/avg_neon.c index 001517d33e..fa7dd09600 100644 --- a/libs/libvpx/vpx_dsp/arm/avg_neon.c +++ b/libs/libvpx/vpx_dsp/arm/avg_neon.c @@ -15,62 +15,48 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); +uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint8x16_t b = load_unaligned_u8q(a, a_stride); + const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); + const uint32x2_t d = horizontal_add_uint16x8(c); + return vget_lane_u32(vrshr_n_u32(d, 4), 0); } -unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} +uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { + int i; + uint8x8_t b, c; + uint16x8_t sum; + uint32x2_t d; + b = vld1_u8(a); + a += a_stride; + c = vld1_u8(a); + a += a_stride; + sum = vaddl_u8(b, c); -unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + for (i = 0; i < 6; ++i) { + const uint8x8_t d = vld1_u8(a); + a += a_stride; + sum = vaddw_u8(sum, d); + } - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); + d = horizontal_add_uint16x8(sum); - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - return (horizontal_add_u16x8(v_sum) + 32) >> 6; + return vget_lane_u32(vrshr_n_u32(d, 6), 0); } // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. 
-int vpx_satd_neon(const int16_t *coeff, int length) { +int vpx_satd_neon(const tran_low_t *coeff, int length) { const int16x4_t zero = vdup_n_s16(0); int32x4_t accum = vdupq_n_s32(0); do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); + const int16x8_t src0 = load_tran_low_to_s16q(coeff); + const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); accum = vabal_s16(accum, vget_low_s16(src0), zero); accum = vabal_s16(accum, vget_high_s16(src0), zero); accum = vabal_s16(accum, vget_low_s16(src8), zero); @@ -153,7 +139,8 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return horizontal_add_u16x8(vec_sum); + return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), + 0); } // ref, src = [0, 510] - max diff = 16-bits @@ -183,7 +170,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { { // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); diff --git a/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c new file mode 100644 index 0000000000..1370ec2d2e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const uint8x16_t p = vld1q_u8(pred + x); + const uint8x16_t r = vld1q_u8(ref + x); + const uint8x16_t avg = vrhaddq_u8(p, r); + vst1q_u8(comp + x, avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { + int i; + for (i = 0; i < width * height; i += 16) { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + if (width == 4) { + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + } else { + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + assert(width == 8); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + } + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/deblock_neon.c b/libs/libvpx/vpx_dsp/arm/deblock_neon.c new file mode 100644 index 0000000000..1fb41d2992 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/deblock_neon.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" + +extern const int16_t vpx_rv[]; + +static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2) { + const uint8x8_t k1 = vrhadd_u8(a2, a1); + const uint8x8_t k2 = vrhadd_u8(b2, b1); + const uint8x8_t k3 = vrhadd_u8(k1, k2); + return vrhadd_u8(k3, v0); +} + +static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t a2_v0 = vabd_u8(a2, v0); + const uint8x8_t a1_v0 = vabd_u8(a1, v0); + const uint8x8_t b1_v0 = vabd_u8(b1, v0); + const uint8x8_t b2_v0 = vabd_u8(b2, v0); + + uint8x8_t max = vmax_u8(a2_v0, a1_v0); + max = vmax_u8(b1_v0, max); + max = vmax_u8(b2_v0, max); + return vclt_u8(max, filter); +} + +static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2); + const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter); + + return vbsl_u8(mask, k_out, v0); +} + +// Same functions but for uint8x16_t. +static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2) { + const uint8x16_t k1 = vrhaddq_u8(a2, a1); + const uint8x16_t k2 = vrhaddq_u8(b2, b1); + const uint8x16_t k3 = vrhaddq_u8(k1, k2); + return vrhaddq_u8(k3, v0); +} + +static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, const uint8x16_t filter) { + const uint8x16_t a2_v0 = vabdq_u8(a2, v0); + const uint8x16_t a1_v0 = vabdq_u8(a1, v0); + const uint8x16_t b1_v0 = vabdq_u8(b1, v0); + const uint8x16_t b2_v0 = vabdq_u8(b2, v0); + + uint8x16_t max = vmaxq_u8(a2_v0, a1_v0); + max = vmaxq_u8(b1_v0, max); + max = vmaxq_u8(b2_v0, max); + return vcltq_u8(max, filter); +} + +static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, + const uint8x16_t filter) { + const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2); + const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter); + + return vbslq_u8(mask, k_out, v0); +} + +void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int dst_stride, int cols, + uint8_t *f, int size) { + uint8_t *src, *dst; + int row; + int col; + + // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for + // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 + // (for U/V). + assert((size == 8 || size == 16) && cols % 8 == 0); + + // While columns of length 16 can be processed, load them. 
+ for (col = 0; col < cols - 8; col += 16) { + uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1q_u8(src); + src += src_stride; + a1 = vld1q_u8(src); + src += src_stride; + a2 = vld1q_u8(src); + src += src_stride; + a3 = vld1q_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x16_t filterq = vld1q_u8(f + col); + + a4 = vld1q_u8(src); + src += src_stride; + a5 = vld1q_u8(src); + src += src_stride; + a6 = vld1q_u8(src); + src += src_stride; + a7 = vld1q_u8(src); + src += src_stride; + + v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq); + v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq); + v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq); + v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq); + + vst1q_u8(dst, v_out_0); + dst += dst_stride; + vst1q_u8(dst, v_out_1); + dst += dst_stride; + vst1q_u8(dst, v_out_2); + dst += dst_stride; + vst1q_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + src_ptr += 16; + dst_ptr += 16; + } + + // Clean up any left over column of length 8. + if (col != cols) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1_u8(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = vld1_u8(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x8_t filter = vld1_u8(f + col); + + a4 = vld1_u8(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = vld1_u8(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + + v_out_0 = generate_output(a0, a1, a2, a3, a4, filter); + v_out_1 = generate_output(a1, a2, a3, a4, a5, filter); + v_out_2 = generate_output(a2, a3, a4, a5, a6, filter); + v_out_3 = generate_output(a3, a4, a5, a6, a7, filter); + + vst1_u8(dst, v_out_0); + dst += dst_stride; + vst1_u8(dst, v_out_1); + dst += dst_stride; + vst1_u8(dst, v_out_2); + dst += dst_stride; + vst1_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + // Not strictly necessary but makes resetting dst_ptr easier. + dst_ptr += 8; + } + + dst_ptr -= cols; + + for (row = 0; row < size; row += 8) { + uint8x8_t a0, a1, a2, a3; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + src = dst_ptr; + dst = dst_ptr; + + // Load 8 values, transpose 4 of them, and discard 2 because they will be + // reloaded later. + load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3); + a3 = a1; + a2 = a1 = a0; // Extend left border. + + src += 2; + + for (col = 0; col < cols; col += 8) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6, + v_out_7; + // Although the filter is meant to be applied vertically and is instead + // being applied horizontally here it's OK because it's set in blocks of 8 + // (or 16). + const uint8x8_t filter = vld1_u8(f + col); + + load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5, + &b6, &b7); + + if (col + 8 == cols) { + // Last row. Extend border (b5). 
+ b6 = b7 = b5; + } + + v_out_0 = generate_output(a0, a1, a2, a3, b0, filter); + v_out_1 = generate_output(a1, a2, a3, b0, b1, filter); + v_out_2 = generate_output(a2, a3, b0, b1, b2, filter); + v_out_3 = generate_output(a3, b0, b1, b2, b3, filter); + v_out_4 = generate_output(b0, b1, b2, b3, b4, filter); + v_out_5 = generate_output(b1, b2, b3, b4, b5, filter); + v_out_6 = generate_output(b2, b3, b4, b5, b6, filter); + v_out_7 = generate_output(b3, b4, b5, b6, b7, filter); + + transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2, + v_out_3, v_out_4, v_out_5, v_out_6, v_out_7); + + a0 = b4; + a1 = b5; + a2 = b6; + a3 = b7; + + src += 8; + dst += 8; + } + + dst_ptr += 8 * dst_stride; + } +} + +// sum += x; +// sumsq += x * y; +static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy, + int16x4_t *const sum, int32x4_t *const sumsq) { + const int16x4_t zero = vdup_n_s16(0); + const int32x4_t zeroq = vdupq_n_s32(0); + + // Add in the first set because vext doesn't work with '0'. + *sum = vadd_s16(*sum, x); + *sumsq = vaddq_s32(*sumsq, xy); + + // Shift x and xy to the right and sum. vext requires an immediate. + *sum = vadd_s16(*sum, vext_s16(zero, x, 1)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 2)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 3)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3)); +} + +// Generate mask based on (sumsq * 15 - sum * sum < flimit) +static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq, + const int32x4_t f, const int32x4_t fifteen) { + const int32x4_t a = vmulq_s32(sumsq, fifteen); + const int32x4_t b = vmlsl_s16(a, sum, sum); + const uint32x4_t mask32 = vcltq_s32(b, f); + return vmovn_u32(mask32); +} + +static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high, + const int32x4_t sumsq_low, + const int32x4_t sumsq_high, const int32x4_t f) { + const int32x4_t fifteen = vdupq_n_s32(15); + const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen); + const uint16x4_t mask16_high = + calculate_mask(sum_high, sumsq_high, f, fifteen); + return vmovn_u16(vcombine_u16(mask16_low, mask16_high)); +} + +// Apply filter of (8 + sum + s[c]) >> 4. +static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + + return vqrshrun_n_s16(sum_s, 4); +} + +void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, + int flimit) { + int row, col; + const int32x4_t f = vdupq_n_s32(flimit); + + assert(cols % 8 == 0); + + for (row = 0; row < rows; ++row) { + // Sum the first 8 elements, which are extended from s[0]. + // sumsq gets primed with +16. + int sumsq = src[0] * src[0] * 9 + 16; + int sum = src[0] * 9; + + uint8x8_t left_context, s, right_context; + int16x4_t sum_low, sum_high; + int32x4_t sumsq_low, sumsq_high; + + // Sum (+square) the next 6 elements. + // Skip [0] because it's included above. + for (col = 1; col <= 6; ++col) { + sumsq += src[col] * src[col]; + sum += src[col]; + } + + // Prime the sums. Later the loop uses the _high values to prime the new + // vectors. + sumsq_high = vdupq_n_s32(sumsq); + sum_high = vdup_n_s16(sum); + + // Manually extend the left border. 
+ left_context = vdup_n_u8(src[0]); + + for (col = 0; col < cols; col += 8) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(src + col); + + if (col + 8 == cols) { + // Last row. Extend border. + right_context = vdup_n_u8(src[col + 7]); + } else { + right_context = vld1_u8(src + col + 7); + } + + x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context)); + y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context)); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + // Catch up to the last sum'd value. + sum_low = vdup_lane_s16(sum_high, 3); + sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1); + + accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low); + + // Need to do this sequentially because we need the max value from + // sum_low. + sum_high = vdup_lane_s16(sum_low, 3); + sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1); + + accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high); + + mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f); + + output = filter_pixels(vcombine_s16(sum_low, sum_high), s); + output = vbsl_u8(mask, output, s); + + vst1_u8(src + col, output); + + left_context = s; + } + + src += pitch; + } +} + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. 
+ if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libs/libvpx/vpx_dsp/arm/fdct16x16_neon.c new file mode 100644 index 0000000000..6b2bebd097 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline +// functions. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +#else + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. 
+ +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. +static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/, const int pass) { + if (pass == 0) { + b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); + b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); + b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); + b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); + b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); + b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); + b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); + b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + + b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); + b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); + b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); + b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); + b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); + b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); + b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); + b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); + } else { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + } +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t c, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void 
butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t c0, + const tran_coef_t c1, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, + int16x8_t *b /*[8]*/) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +// Main body of fdct16x16. 
+static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) + // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + &out[7]); + butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + &out[15]); + butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + &out[3]); + butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + &out[11]); +} + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[16]; + int16x8_t temp1[16]; + int16x8_t temp2[16]; + int16x8_t temp3[16]; + + // Left half. + load(input, stride, temp0); + cross_input(temp0, temp1, 0); + dct_body(temp1, temp0); + + // Right half. + load(input + 8, stride, temp1); + cross_input(temp1, temp2, 0); + dct_body(temp2, temp1); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. 
+ transpose_8x8(&temp0[0], &temp2[0]); + transpose_8x8(&temp1[0], &temp2[8]); + partial_round_shift(temp2); + cross_input(temp2, temp3, 1); + dct_body(temp3, temp2); + transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], + &temp2[5], &temp2[6], &temp2[7]); + transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], + &temp2[13], &temp2[14], &temp2[15]); + store(output, temp2); + store(output + 8, temp2 + 8); + output += 8 * 16; + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + partial_round_shift(temp1); + cross_input(temp1, temp0, 1); + dct_body(temp0, temp1); + transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], + &temp1[5], &temp1[6], &temp1[7]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + store(output, temp1); + store(output + 8, temp1 + 8); +} +#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/libs/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libs/libvpx/vpx_dsp/arm/fdct32x32_neon.c new file mode 100644 index 0000000000..e9cd34904b --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -0,0 +1,1507 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Most gcc 4.9 distributions outside of Android do not generate correct code +// for this function. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ <= 9 + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +#else + +#define LOAD_INCREMENT(src, stride, dest, index) \ + do { \ + dest[index] = vld1q_s16(src); \ + src += stride; \ + } while (0) + +#define ADD_S16(src, index0, index1, dest, index3) \ + do { \ + dest[index3] = vaddq_s16(src[index0], src[index1]); \ + } while (0) + +#define ADD_SHIFT_S16(src, index0, index1) \ + do { \ + src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ + } while (0) + +// Load, cross, and multiply by 4. Load the first 8 and last 8, then the +// middle +// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + const int16_t *a_end = a + 24 * stride; + int16x8_t c[8]; + + LOAD_INCREMENT(a, stride, b, 0); + LOAD_INCREMENT(a, stride, b, 1); + LOAD_INCREMENT(a, stride, b, 2); + LOAD_INCREMENT(a, stride, b, 3); + LOAD_INCREMENT(a, stride, b, 4); + LOAD_INCREMENT(a, stride, b, 5); + LOAD_INCREMENT(a, stride, b, 6); + LOAD_INCREMENT(a, stride, b, 7); + + LOAD_INCREMENT(a_end, stride, b, 24); + LOAD_INCREMENT(a_end, stride, b, 25); + LOAD_INCREMENT(a_end, stride, b, 26); + LOAD_INCREMENT(a_end, stride, b, 27); + LOAD_INCREMENT(a_end, stride, b, 28); + LOAD_INCREMENT(a_end, stride, b, 29); + LOAD_INCREMENT(a_end, stride, b, 30); + LOAD_INCREMENT(a_end, stride, b, 31); + + ADD_S16(b, 0, 31, c, 0); + ADD_S16(b, 1, 30, c, 1); + ADD_S16(b, 2, 29, c, 2); + ADD_S16(b, 3, 28, c, 3); + ADD_S16(b, 4, 27, c, 4); + ADD_S16(b, 5, 26, c, 5); + ADD_S16(b, 6, 25, c, 6); + ADD_S16(b, 7, 24, c, 7); + + ADD_SHIFT_S16(b, 7, 24); + ADD_SHIFT_S16(b, 6, 25); + ADD_SHIFT_S16(b, 5, 26); + ADD_SHIFT_S16(b, 4, 27); + ADD_SHIFT_S16(b, 3, 28); + ADD_SHIFT_S16(b, 2, 29); + ADD_SHIFT_S16(b, 1, 30); + ADD_SHIFT_S16(b, 0, 31); + + b[0] = vshlq_n_s16(c[0], 2); + b[1] = vshlq_n_s16(c[1], 2); + b[2] = vshlq_n_s16(c[2], 2); + b[3] = vshlq_n_s16(c[3], 2); + b[4] = vshlq_n_s16(c[4], 2); + b[5] = vshlq_n_s16(c[5], 2); + b[6] = vshlq_n_s16(c[6], 2); + b[7] = vshlq_n_s16(c[7], 2); + + LOAD_INCREMENT(a, stride, b, 8); + LOAD_INCREMENT(a, stride, b, 9); + LOAD_INCREMENT(a, stride, b, 10); + LOAD_INCREMENT(a, stride, b, 11); + LOAD_INCREMENT(a, stride, b, 12); + LOAD_INCREMENT(a, stride, b, 13); + LOAD_INCREMENT(a, stride, b, 14); + LOAD_INCREMENT(a, stride, b, 15); + LOAD_INCREMENT(a, stride, b, 16); + LOAD_INCREMENT(a, stride, b, 17); + LOAD_INCREMENT(a, stride, b, 18); + LOAD_INCREMENT(a, stride, b, 19); + LOAD_INCREMENT(a, stride, b, 20); + LOAD_INCREMENT(a, stride, b, 21); + LOAD_INCREMENT(a, stride, b, 22); + LOAD_INCREMENT(a, stride, b, 23); + + ADD_S16(b, 8, 23, c, 0); + ADD_S16(b, 9, 22, c, 1); + ADD_S16(b, 10, 21, c, 2); + ADD_S16(b, 11, 20, c, 3); + ADD_S16(b, 12, 19, c, 4); + ADD_S16(b, 13, 18, c, 5); + ADD_S16(b, 14, 17, c, 6); + ADD_S16(b, 15, 16, c, 7); + + ADD_SHIFT_S16(b, 15, 16); + ADD_SHIFT_S16(b, 14, 17); + ADD_SHIFT_S16(b, 13, 18); + ADD_SHIFT_S16(b, 12, 19); + ADD_SHIFT_S16(b, 11, 20); + ADD_SHIFT_S16(b, 10, 21); + ADD_SHIFT_S16(b, 9, 22); + ADD_SHIFT_S16(b, 8, 23); + + b[8] = vshlq_n_s16(c[0], 2); + b[9] = vshlq_n_s16(c[1], 2); + b[10] = vshlq_n_s16(c[2], 2); + b[11] = vshlq_n_s16(c[3], 2); + b[12] = vshlq_n_s16(c[4], 2); + b[13] = vshlq_n_s16(c[5], 2); + b[14] = vshlq_n_s16(c[6], 2); + b[15] = vshlq_n_s16(c[7], 2); +} + +#undef LOAD_INCREMENT +#undef ADD_S16 +#undef ADD_SHIFT_S16 + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0); + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t constant, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant0, + const tran_coef_t constant1, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); + butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); + butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); + butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. 
+ butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift(b[0]); + out[16] = sub_round_shift(b[1]); + out[8] = sub_round_shift(b[2]); + out[24] = sub_round_shift(b[3]); + out[4] = sub_round_shift(b[4]); + out[20] = sub_round_shift(b[5]); + out[12] = sub_round_shift(b[6]); + out[28] = sub_round_shift(b[7]); + out[2] = sub_round_shift(b[8]); + out[18] = sub_round_shift(b[9]); + out[10] = sub_round_shift(b[10]); + out[26] = sub_round_shift(b[11]); + out[6] = sub_round_shift(b[12]); + out[22] = sub_round_shift(b[13]); + out[14] = sub_round_shift(b[14]); + out[30] = sub_round_shift(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); + out[1] = sub_round_shift(a[1]); + out[31] = sub_round_shift(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); + out[17] = sub_round_shift(a[17]); + out[15] = sub_round_shift(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); + out[9] = sub_round_shift(a[9]); + out[23] = sub_round_shift(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); + out[25] = sub_round_shift(a[25]); + out[7] = sub_round_shift(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); + out[5] = sub_round_shift(a[5]); + out[27] = sub_round_shift(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); + out[21] = sub_round_shift(a[21]); + out[11] = sub_round_shift(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); + out[13] = sub_round_shift(a[13]); + out[19] = sub_round_shift(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); + out[29] = sub_round_shift(a[29]); + out[3] = sub_round_shift(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = 
vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +// Like butterfly_one_coeff, but don't narrow results. +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_high_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_one_coeff, but with s32. +static INLINE void butterfly_one_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); + const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); + const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); + const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_two_coeff, but with s32. 
+static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); + const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); + const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); + const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); + const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); + const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. 
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. 
+ ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); + + // Stage 7. 
+ PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); + out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); + out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); + out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); + out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); + out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); + out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); + out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); + out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. 
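[Editor's note: a minimal scalar sketch of the "add 1 if positive, 2 if negative, then shift by 2" rounding described in the comment above, assuming two's-complement arithmetic right shifts; the function name is illustrative and not part of the patch:

#include <stdint.h>

static int16_t add_round_shift_scalar(int16_t a) {
  const int16_t sign = (a < 0) ? 1 : 0; /* the sign bit, extracted with vshrq_n_u16(a, 15) in the vector code */
  /* Add 1, then add the sign bit (so negative values get 2 in total), then
   * shift by 2 without further rounding. */
  return (int16_t)((a + 1 + sign) >> 2);
}
]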
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. 
+ b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); + + // Stage 7. 
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +// TODO(johannkoenig): share with other fdcts. +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. 
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/libs/libvpx/vpx_dsp/arm/fdct_neon.c b/libs/libvpx/vpx_dsp/arm/fdct_neon.c new file mode 100644 index 0000000000..04646ed2e0 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/fdct_neon.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ int i;
+ // input[M * stride] * 16
+ int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ input_0 = vadd_s16(input_0, one);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ const int16x8_t input_01 = vcombine_s16(input_0, input_1);
+ const int16x8_t input_32 = vcombine_s16(input_3, input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+ const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+ const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+ input_0 = out_0;
+ input_1 = out_1;
+ input_2 = out_2;
+ input_3 = out_3;
+ }
+
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+ }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libs/libvpx/vpx_dsp/arm/fdct_partial_neon.c
new file mode 100644
index 0000000000..e73de41d77
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE tran_low_t get_lane(const int32x2_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vget_lane_s32(a, 0);
+#else
+ return vget_lane_s16(vreinterpret_s16_s32(a), 0);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x4_t a0, a1, a2, a3;
+ int16x8_t b0, b1;
+ int16x8_t c;
+ int32x2_t d;
+
+ a0 = vld1_s16(input);
+ input += stride;
+ a1 = vld1_s16(input);
+ input += stride;
+ a2 = vld1_s16(input);
+ input += stride;
+ a3 = vld1_s16(input);
+
+ b0 = vcombine_s16(a0, a1);
+ b1 = vcombine_s16(a2, a3);
+
+ c = vaddq_s16(b0, b1);
+
+ d = horizontal_add_int16x8(c);
+
+ output[0] = get_lane(vshl_n_s32(d, 1));
+ output[1] = 0;
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+
+ output[0] = get_lane(horizontal_add_int16x8(sum));
+ output[1] = 0;
+}
+
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t left = vld1q_s16(input);
+ int16x8_t right = vld1q_s16(input + 8);
+ int32x2_t sum;
+ input += stride;
+
+ for (r = 1; r < 16; ++r) {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ left = vaddq_s16(left, a);
+ right = vaddq_s16(right, b);
+ }
+
+ sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
+
+ output[0] = get_lane(vshr_n_s32(sum, 1));
+ output[1] = 0;
+}
+
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t a0 = vld1q_s16(input);
+ int16x8_t a1 = vld1q_s16(input + 8);
+ int16x8_t a2 = vld1q_s16(input + 16);
+ int16x8_t a3 = vld1q_s16(input + 24);
+ int32x2_t sum;
+ input += stride;
+
+ for (r = 1; r < 32; ++r) {
+ const int16x8_t b0 = vld1q_s16(input);
+ const int16x8_t b1 = vld1q_s16(input + 8);
+ const int16x8_t b2 = vld1q_s16(input + 16);
+ const int16x8_t b3 = vld1q_s16(input + 24);
+ input += stride;
+ a0 = vaddq_s16(a0, b0);
+ a1 = vaddq_s16(a1, b1);
+ a2 = vaddq_s16(a2, b2);
+ a3 = vaddq_s16(a3, b3);
+ }
+
+ sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
+ sum = vadd_s32(sum, horizontal_add_int16x8(a2));
+ sum = vadd_s32(sum, horizontal_add_int16x8(a3));
+ output[0] = get_lane(vshr_n_s32(sum, 3));
+ output[1] = 0;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index 7cb2ba90d2..8049277b13 100644
--- a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -12,8 +12,12 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
-void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
int i;
// stage 1
int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -44,14 +48,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
int32x4_t v_t1_hi =
vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); @@ -73,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // Stage 2 v_x0 = vsubq_s16(v_s6, v_s5); v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -91,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { v_x3 = vaddq_s16(v_s7, cd); } // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); + v_t0_lo = 
vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -122,6 +126,8 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
}
// transpose 8x8
+ // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+ // columns.
{
// 00 01 02 03 40 41 42 43
// 10 11 12 13 50 51 52 53
@@ -191,30 +197,13 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
input_6 = vhsubq_s16(input_6, sign_in6);
input_7 = vhsubq_s16(input_7, sign_in7);
// store results
- vst1q_s16(&final_output[0 * 8], input_0);
- vst1q_s16(&final_output[1 * 8], input_1);
- vst1q_s16(&final_output[2 * 8], input_2);
- vst1q_s16(&final_output[3 * 8], input_3);
- vst1q_s16(&final_output[4 * 8], input_4);
- vst1q_s16(&final_output[5 * 8], input_5);
- vst1q_s16(&final_output[6 * 8], input_6);
- vst1q_s16(&final_output[7 * 8], input_7);
- }
-}
-
-void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
- int r;
- int16x8_t sum = vld1q_s16(&input[0]);
- for (r = 1; r < 8; ++r) {
- const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
- sum = vaddq_s16(sum, input_00);
- }
- {
- const int32x4_t a = vpaddlq_s16(sum);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
- output[1] = 0;
+ store_s16q_to_tran_low(final_output + 0 * 8, input_0);
+ store_s16q_to_tran_low(final_output + 1 * 8, input_1);
+ store_s16q_to_tran_low(final_output + 2 * 8, input_2);
+ store_s16q_to_tran_low(final_output + 3 * 8, input_3);
+ store_s16q_to_tran_low(final_output + 4 * 8, input_4);
+ store_s16q_to_tran_low(final_output + 5 * 8, input_5);
+ store_s16q_to_tran_low(final_output + 6 * 8, input_6);
+ store_s16q_to_tran_low(final_output + 7 * 8, input_7);
}
}
diff --git a/libs/libvpx/vpx_dsp/arm/hadamard_neon.c b/libs/libvpx/vpx_dsp/arm/hadamard_neon.c
index 977323497a..523a63c6f7 100644
--- a/libs/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -11,6 +11,9 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
@@ -44,8 +47,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
*a7 = vaddq_s16(c1, c5);
}
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
- int16_t *coeff) { +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); @@ -63,18 +66,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, // Skip the second transpose because it is not required. - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); } -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int i; /* Rearrange 16x16 to 8x32 and remove stride. @@ -88,10 +91,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); const int16x8_t b0 = vhaddq_s16(a0, a1); const int16x8_t b1 = vhsubq_s16(a0, a1); @@ -103,10 +106,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); coeff += 8; } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c new file mode 100644 index 0000000000..5358839b53 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -0,0 +1,1437 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int32x2x2_t t32[4];
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+ t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+ t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+ t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
+ d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+ d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
+ d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int32x2x2_t t32[2];
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+}
+
+static INLINE int32x4x2_t
+highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) {
+ int32x2x2_t t32[2];
+ int32x4x2_t d;
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+ return d;
+}
+
+static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) {
+ int32x2x2_t t32;
+
+ t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS);
+ t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS);
+ return vcombine_s32(t32.val[0], t32.val[1]);
+}
+
+static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+
vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], 
vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + 
t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_q_kernel( + const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + 
vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_d_kernel( + const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = + vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = + vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]); + t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); + t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); + t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + 
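+  // Reuse the plain 8/24 butterfly kernel, then negate its second product pair below so the rounded outputs match the butterfly that needs the negated constants.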
highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); + t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[6]; + + t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[3]; + + t[2].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct16x16_add_stage7_dual( + const int32x4x2_t *const step2, int32x4x2_t *const out) { + out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]); + out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]); + out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]); + out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]); + out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]); + out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]); + out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]); + out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]); + out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]); + out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]); + out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]); + out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]); + out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]); + out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]); + 
out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]); + out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]); + out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]); + out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]); + out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]); + out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]); + out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]); + out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]); + out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]); + out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]); + out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]); + out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]); + out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]); + out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]); + out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]); + out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]); + out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]); + out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, + int32x4_t *const out) { + out[0] = vaddq_s32(step2[0], step2[15]); + out[1] = vaddq_s32(step2[1], step2[14]); + out[2] = vaddq_s32(step2[2], step2[13]); + out[3] = vaddq_s32(step2[3], step2[12]); + out[4] = vaddq_s32(step2[4], step2[11]); + out[5] = vaddq_s32(step2[5], step2[10]); + out[6] = vaddq_s32(step2[6], step2[9]); + out[7] = vaddq_s32(step2[7], step2[8]); + out[8] = vsubq_s32(step2[7], step2[8]); + out[9] = vsubq_s32(step2[6], step2[9]); + out[10] = vsubq_s32(step2[5], step2[10]); + out[11] = vsubq_s32(step2[4], step2[11]); + out[12] = vsubq_s32(step2[3], step2[12]); + out[13] = vsubq_s32(step2[2], step2[13]); + out[14] = vsubq_s32(step2[1], step2[14]); + out[15] = vsubq_s32(step2[0], step2[15]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + 
vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, + const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], + &step2[15]); + highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], + &step1[7]); + highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], + &step1[6]); + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], 
step2[9].val[1]); + step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]); + step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]); + step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]); + step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]); + step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]); + step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]); + step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]); + step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]); + step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]); + step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]); + + // stage 4 + highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], + &step2[0]); + highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], + &step2[3]); + step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]); + step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]); + step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]); + step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]); + step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]); + step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]); + step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]); + step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]); + step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]); + step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]); + step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]); + step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]); + step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]); + step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]); + step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]); + step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]); + step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]); + step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]); + step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]); + step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]); + step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]); + step1[14].val[1] = 
vaddq_s32(step2[14].val[1], step2[13].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]); + + // stage 6 + step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]); + step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]); + step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]); + step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]); + step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]); + step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]); + step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]); + step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]); + step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]); + step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]); + step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]); + step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]); + step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); + return highbd_idct16x16_add_wrap_low_8x1(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); + return highbd_idct16x16_add_wrap_low_4x1(t); +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); + return highbd_idct16x16_add_wrap_low_8x1(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); + return highbd_idct16x16_add_wrap_low_4x1(t); +} + +static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + 
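+  // Only the top-left 8x8 quadrant of coefficients can be non-zero in this case, so a single 8x8 load and transpose feeds each pass.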
int32x4x2_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 16; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 16; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 16; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 16; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 16; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 16; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 16; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[9] = highbd_idct_cospi_lane1_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[10] = + highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[13] = + highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[14] = highbd_idct_cospi_lane0_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[5] = + highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28)); + step1[6] = + highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = highbd_idct_add_dual(step2[8], step2[9]); + step1[9] = highbd_idct_sub_dual(step2[8], step2[9]); + step1[10] = highbd_idct_sub_dual(step2[11], step2[10]); + step1[11] = highbd_idct_add_dual(step2[11], step2[10]); + step1[12] = highbd_idct_add_dual(step2[12], step2[13]); + step1[13] = highbd_idct_sub_dual(step2[12], step2[13]); + step1[14] = highbd_idct_sub_dual(step2[15], step2[14]); + step1[15] = highbd_idct_add_dual(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[2] = + highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24)); + step2[3] = + highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24)); + step2[4] = highbd_idct_add_dual(step1[4], step1[5]); + step2[5] = highbd_idct_sub_dual(step1[4], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[7], step1[6]); + step2[7] = highbd_idct_add_dual(step1[7], step1[6]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + 
step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = highbd_idct_add_dual(step2[0], step2[3]); + step1[1] = highbd_idct_add_dual(step2[1], step2[2]); + step1[2] = highbd_idct_sub_dual(step2[1], step2[2]); + step1[3] = highbd_idct_sub_dual(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = vld1q_s32(input); + input += 16; + in[1] = vld1q_s32(input); + input += 16; + in[2] = vld1q_s32(input); + input += 16; + in[3] = vld1q_s32(input); + + // Transpose + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = 
step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s32(step2[8], step2[11]); + step1[9] = vaddq_s32(step2[9], step2[10]); + step1[10] = vsubq_s32(step2[9], step2[10]); + step1[11] = vsubq_s32(step2[8], step2[11]); + step1[12] = vsubq_s32(step2[15], step2[12]); + step1[13] = vsubq_s32(step2[14], step2[13]); + step1[14] = vaddq_s32(step2[14], step2[13]); + step1[15] = vaddq_s32(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s32(step1[0], step1[7]); + step2[1] = vaddq_s32(step1[1], step1[6]); + step2[2] = vaddq_s32(step1[2], step1[5]); + step2[3] = vaddq_s32(step1[3], step1[4]); + step2[4] = vsubq_s32(step1[3], step1[4]); + step2[5] = vsubq_s32(step1[2], step1[5]); + step2[6] = vsubq_s32(step1[1], step1[6]); + step2[7] = vsubq_s32(step1[0], step1[7]); + highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7(step2, out); + + // pass 1: save the result into output + vst1q_s32(output, out[0]); + output += 4; + vst1q_s32(output, out[1]); + output += 4; + vst1q_s32(output, out[2]); + output += 4; + vst1q_s32(output, out[3]); + output += 4; + vst1q_s32(output, out[4]); + output += 4; + vst1q_s32(output, out[5]); + output += 4; + vst1q_s32(output, out[6]); + output += 4; + vst1q_s32(output, out[7]); + output += 4; + vst1q_s32(output, out[8]); + output += 4; + vst1q_s32(output, out[9]); + output += 4; + vst1q_s32(output, out[10]); + output += 4; + vst1q_s32(output, out[11]); + output += 4; + vst1q_s32(output, out[12]); + output += 4; + vst1q_s32(output, out[13]); + output += 4; + vst1q_s32(output, out[14]); + output += 4; + vst1q_s32(output, out[15]); +} + +void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + in[0].val[0] = vld1q_s32(input); + input += 4; + in[0].val[1] = vld1q_s32(input); + input += 4; + in[1].val[0] = vld1q_s32(input); + input += 4; + in[1].val[1] = vld1q_s32(input); + input += 4; + in[2].val[0] = vld1q_s32(input); + input += 4; + in[2].val[1] = vld1q_s32(input); + input += 4; + in[3].val[0] = vld1q_s32(input); + input += 4; + in[3].val[1] = vld1q_s32(input); + + // Transpose + transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1], + &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + 
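+  // With only the top-left 4x4 coefficients present, one input of each rotation is zero, so a single lane multiply replaces the usual multiply-accumulate pair.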
step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, 
dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, + bd); + + // Parallel idct on the lower 8 rows + vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, + dest, stride, bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, + dest + 8, stride, bd); + } +} + +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, + bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, + stride, bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); + } +} + +static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + *dest += stride; +} + +static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, 
vreinterpretq_s16_u16(a1)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + vst1q_u16(*dest + 0, c0); + vst1q_u16(*dest + 8, c1); + *dest += stride; +} + +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c new file mode 100644 index 0000000000..96a55c472f --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_from_transformed(const int32_t *const trans_buf, + const int first, const int second, + int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(trans_buf + first * 8); + q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4); + q1->val[0] = vld1q_s32(trans_buf + second * 8); + q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4); +} + +static INLINE void load_from_output(const int32_t *const out, const int first, + const int second, int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(out + first * 32); + q0->val[1] = vld1q_s32(out + first * 32 + 4); + q1->val[0] = vld1q_s32(out + second * 32); + q1->val[1] = vld1q_s32(out + second * 32 + 4); +} + +static INLINE void store_in_output(int32_t *const out, const int first, + const int second, const int32x4x2_t q0, + const int32x4x2_t q1) { + vst1q_s32(out + first * 32, q0.val[0]); + vst1q_s32(out + first * 32 + 4, q0.val[1]); + vst1q_s32(out + second * 32, q1.val[0]); + vst1q_s32(out + second * 32 + 4, q1.val[1]); +} + +static INLINE void highbd_store_combine_results( + uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0, + const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3, + const int16x8_t max) { + int16x8_t o[4]; + uint16x8_t d[4]; + + d[0] = vld1q_u16(p1); + p1 += stride; + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); + p2 -= stride; + d[2] = vld1q_u16(p2); + + o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6)); + + o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0])); + o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1])); + o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2])); + o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3])); + o[0] = vminq_s16(o[0], max); + o[1] = vminq_s16(o[1], max); + o[2] = vminq_s16(o[2], max); + o[3] = vminq_s16(o[3], max); + d[0] = vqshluq_n_s16(o[0], 0); + d[1] = vqshluq_n_s16(o[1], 0); + d[2] = vqshluq_n_s16(o[2], 0); + d[3] = vqshluq_n_s16(o[3], 0); + + vst1q_u16(p1, d[1]); + p1 -= stride; + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); + p2 += stride; + vst1q_u16(p2, d[3]); +} + +static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, + const int32_t first_const, + const int32_t second_const, + int32x4x2_t *const qOut0, + int32x4x2_t *const qOut1) { + int64x2x2_t q[4]; + int32x2_t d[6]; + + // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
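+  // Broadcast the two constants, form qOut0 = qIn0 * first_const - qIn1 * second_const and qOut1 = qIn0 * second_const + qIn1 * first_const, then narrow each sum back to 32 bits with rounding by DCT_CONST_BITS.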
+ d[4] = vdup_n_s32(first_const); + d[5] = vdup_n_s32(second_const); + + q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]); + q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]); + q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]); + q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]); + q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]); + q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]); + q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]); + q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]); + + q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]); + q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]); + q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]); + q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]); + q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]); + q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]); + q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]); + q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]); + + qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS)); + qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS)); + qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS)); + qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); +} + +static INLINE void load_s32x4q_dual( + const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1, + int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4, + int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) { + s0->val[0] = vld1q_s32(in); + s0->val[1] = vld1q_s32(in + 4); + in += 32; + s1->val[0] = vld1q_s32(in); + s1->val[1] = vld1q_s32(in + 4); + in += 32; + s2->val[0] = vld1q_s32(in); + s2->val[1] = vld1q_s32(in + 4); + in += 32; + s3->val[0] = vld1q_s32(in); + s3->val[1] = vld1q_s32(in + 4); + in += 32; + s4->val[0] = vld1q_s32(in); + s4->val[1] = vld1q_s32(in + 4); + in += 32; + s5->val[0] = vld1q_s32(in); + s5->val[1] = vld1q_s32(in + 4); + in += 32; + s6->val[0] = vld1q_s32(in); + s6->val[1] = vld1q_s32(in + 4); + in += 32; + s7->val[0] = vld1q_s32(in); + s7->val[1] = vld1q_s32(in + 4); +} + +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1, + int32x4x2_t a2, int32x4x2_t a3, + int32x4x2_t a4, int32x4x2_t a5, + int32x4x2_t a6, int32x4x2_t a7, + int32_t **out) { + transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s32(*out, a0.val[0]); + *out += 4; + vst1q_s32(*out, a0.val[1]); + *out += 4; + vst1q_s32(*out, a1.val[0]); + *out += 4; + vst1q_s32(*out, a1.val[1]); + *out += 4; + vst1q_s32(*out, a2.val[0]); + *out += 4; + vst1q_s32(*out, a2.val[1]); + *out += 4; + vst1q_s32(*out, a3.val[0]); + *out += 4; + vst1q_s32(*out, a3.val[1]); + *out += 4; + vst1q_s32(*out, a4.val[0]); + *out += 4; + vst1q_s32(*out, a4.val[1]); + *out += 4; + vst1q_s32(*out, a5.val[0]); + *out += 4; + vst1q_s32(*out, a5.val[1]); + *out += 4; + vst1q_s32(*out, a6.val[0]); + *out += 4; + vst1q_s32(*out, a6.val[1]); + *out += 4; + vst1q_s32(*out, a7.val[0]); + *out += 4; + vst1q_s32(*out, a7.val[1]); + *out += 4; +} + +static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { + int i; + int32x4x2_t s0, s1, 
s2, s3, s4, s5, s6, s7; + + for (i = 0; i < 4; i++, input += 8) { + load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} + +static INLINE void idct32_bands_end_1st_pass(int32_t *const out, + int32x4x2_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); +} + +static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, + uint16_t *const dest, + const int stride, + const int16x8_t max, + int32x4x2_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + 
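+  // The clipped results are combined with the destination two rows at a time, moving down from rows 0 and 16 and up from rows 15 and 31.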
const int str2 = stride << 1; + + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); +} + +static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, + uint16_t *dst, const int stride, + const int bd) { + int i, idct32_pass_loop; + int32_t trans_buf[32 * 8]; + int32_t pass1[32 * 32]; + int32_t pass2[32 * 32]; + int32_t *out; + int32x4x2_t q[16]; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, input = pass1, out = pass2) { + for (i = 0; i < 4; i++, 
out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + input += 32 * 8; + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); + // part of stage 2 + q[4] = highbd_idct_add_dual(q[0], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[6] = highbd_idct_add_dual(q[2], q[3]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); + + // generate 18,19,28,29 + // part of stage 1 + load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[3], q[2]); + q[3] = highbd_idct_add_dual(q[3], q[2]); + q[14] = highbd_idct_sub_dual(q[1], q[0]); + q[2] = highbd_idct_add_dual(q[1], q[0]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); + // part of stage 4 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[15] = highbd_idct_add_dual(q[6], q[3]); + q[13] = highbd_idct_sub_dual(q[5], q[0]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[4], q[2]); + q[14] = highbd_idct_sub_dual(q[6], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + + // generate 22,23,24,25 + // part of stage 1 + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); + // part of stage 2 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); + // part of stage 4 + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[11] = 
highbd_idct_add_dual(q[5], q[0]); + q[12] = highbd_idct_add_dual(q[6], q[2]); + q[15] = highbd_idct_add_dual(q[4], q[3]); + // part of stage 6 + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[11]); + q[9] = highbd_idct_add_dual(q[13], q[10]); + q[13] = highbd_idct_sub_dual(q[13], q[10]); + q[11] = highbd_idct_sub_dual(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = highbd_idct_sub_dual(q[9], q[12]); + q[10] = highbd_idct_add_dual(q[14], q[15]); + q[14] = highbd_idct_sub_dual(q[14], q[15]); + q[12] = highbd_idct_add_dual(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); + // part of stage 4 + q[14] = highbd_idct_sub_dual(q[5], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + q[13] = highbd_idct_sub_dual(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); + // part of stage 6 + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[1]); + q[9] = highbd_idct_add_dual(q[13], q[6]); + q[13] = highbd_idct_sub_dual(q[13], q[6]); + q[1] = highbd_idct_sub_dual(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = highbd_idct_sub_dual(q[8], q[5]); + q[10] = highbd_idct_add_dual(q[8], q[5]); + q[11] = highbd_idct_add_dual(q[9], q[0]); + q[0] = highbd_idct_sub_dual(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); + // part of stage 3 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 4 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); + + // generate 10,11,12,13 + // part of stage 2 + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); + // part of stage 3 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 4 + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); + // part of stage 5 + q[8] = highbd_idct_add_dual(q[0], q[5]); + q[9] = highbd_idct_add_dual(q[1], q[7]); + q[13] = highbd_idct_sub_dual(q[1], q[7]); + 
q[14] = highbd_idct_sub_dual(q[3], q[4]); + q[10] = highbd_idct_add_dual(q[3], q[4]); + q[15] = highbd_idct_add_dual(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); + // part of stage 6 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[5]); + q[14] = highbd_idct_sub_dual(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + + // generate 0,1,2,3 + // part of stage 4 + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); + // part of stage 5 + q[4] = highbd_idct_add_dual(q[7], q[6]); + q[7] = highbd_idct_sub_dual(q[7], q[6]); + q[6] = highbd_idct_sub_dual(q[5], q[14]); + q[5] = highbd_idct_add_dual(q[5], q[14]); + // part of stage 6 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[3]); + q[10] = highbd_idct_add_dual(q[6], q[1]); + q[11] = highbd_idct_add_dual(q[7], q[0]); + q[12] = highbd_idct_sub_dual(q[7], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[1]); + q[14] = highbd_idct_sub_dual(q[5], q[3]); + q[15] = highbd_idct_sub_dual(q[4], q[2]); + // part of stage 7 + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[8], q[1]); + q[3] = highbd_idct_add_dual(q[9], q[0]); + q[4] = highbd_idct_sub_dual(q[9], q[0]); + q[5] = highbd_idct_sub_dual(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q); + } else { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + idct32_bands_end_2nd_pass(out, dst, stride, max, q); + dst += 8; + } + } + } +} + +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); + } else { + vpx_highbd_idct32_32_neon(input, dest, stride, bd); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c new file mode 100644 index 0000000000..3970a5a861 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s32_dual( + const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1, + int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4, + int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) { + in0->val[0] = vld1q_s32(input); + in0->val[1] = vld1q_s32(input + 4); + input += 32; + in1->val[0] = vld1q_s32(input); + in1->val[1] = vld1q_s32(input + 4); + input += 32; + in2->val[0] = vld1q_s32(input); + in2->val[1] = vld1q_s32(input + 4); + input += 32; + in3->val[0] = vld1q_s32(input); + in3->val[1] = vld1q_s32(input + 4); + input += 32; + in4->val[0] = vld1q_s32(input); + in4->val[1] = vld1q_s32(input + 4); + input += 32; + in5->val[0] = vld1q_s32(input); + in5->val[1] = vld1q_s32(input + 4); + input += 32; + in6->val[0] = vld1q_s32(input); + in6->val[1] = vld1q_s32(input + 4); + input += 32; + in7->val[0] = vld1q_s32(input); + in7->val[1] = vld1q_s32(input + 4); +} + +static INLINE void load_4x8_s32_dual(const tran_low_t *input, + int32x4_t *const in0, int32x4_t *const in1, + int32x4_t *const in2, int32x4_t *const in3, + int32x4_t *const in4, int32x4_t *const in5, + int32x4_t *const in6, + int32x4_t *const in7) { + *in0 = vld1q_s32(input); + input += 32; + *in1 = vld1q_s32(input); + input += 32; + *in2 = vld1q_s32(input); + input += 32; + *in3 = vld1q_s32(input); + input += 32; + *in4 = vld1q_s32(input); + input += 32; + *in5 = vld1q_s32(input); + input += 32; + *in6 = vld1q_s32(input); + input += 32; + *in7 = vld1q_s32(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. 
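
The per-element arithmetic behind the multiply_shift_and_narrow, multiply_accumulate_shift_and_narrow, and add-/sub-multiply helpers used throughout these passes is assumed to be the usual cospi multiply followed by the 14-bit DCT rounding shift (DCT_CONST_BITS in vpx_dsp/txfm_common.h). A minimal scalar sketch under that assumption — the names below are illustrative, not the library's — would be:

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14 /* assumed to match DCT_CONST_BITS in txfm_common.h */

/* Multiply one coefficient by a cospi constant, round, and shift back down. */
static int32_t sketch_mul_round_shift(int32_t a, int32_t cospi) {
  const int64_t t = (int64_t)a * cospi + (1 << (SKETCH_DCT_CONST_BITS - 1));
  return (int32_t)(t >> SKETCH_DCT_CONST_BITS);
}

/* The multiply_accumulate variant rotates a pair: a*c0 + b*c1, then rounds. */
static int32_t sketch_mul_acc_round_shift(int32_t a, int32_t c0, int32_t b, int32_t c1) {
  const int64_t t =
      (int64_t)a * c0 + (int64_t)b * c1 + (1 << (SKETCH_DCT_CONST_BITS - 1));
  return (int32_t)(t >> SKETCH_DCT_CONST_BITS);
}

/* The add_ (and, with a minus, sub_) variants combine the two inputs first,
 * then apply the same multiply-and-round step. */
static int32_t sketch_add_mul_round_shift(int32_t a, int32_t b, int32_t cospi) {
  return sketch_mul_round_shift(a + b, cospi);
}

The NEON versions added by this patch are assumed to perform the same math per lane on int32x4x2_t pairs, keeping 64-bit intermediates so high-bit-depth coefficients do not overflow before the rounding shift.
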
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +static void vpx_highbd_idct32_12_neon(const tran_low_t *const input, + int32_t *output) { + int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + s8[32]; + + load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], + &in[6], &in[7]); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0], + &in[9].val[1], &in[10].val[0], &in[10].val[1], + &in[11].val[0], &in[11].val[1]); + transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], + &in[10].val[0], &in[10].val[1], &in[11].val[0], + &in[11].val[1]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + 
+ s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s1[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s1[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s1[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s1[23]); + s4[24] = highbd_idct_add_dual(s1[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s1[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s1[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s1[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s2[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s2[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s2[15], s3[12]); + s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = 
multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s3[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s3[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s3[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + s8[0] = highbd_idct_add_dual(s7[0], s6[31]); + s8[1] = highbd_idct_add_dual(s7[1], s6[30]); + s8[2] = highbd_idct_add_dual(s7[2], s6[29]); + s8[3] = highbd_idct_add_dual(s7[3], s6[28]); + s8[4] = highbd_idct_add_dual(s7[4], s7[27]); + s8[5] = highbd_idct_add_dual(s7[5], s7[26]); + s8[6] = highbd_idct_add_dual(s7[6], s7[25]); + s8[7] = highbd_idct_add_dual(s7[7], s7[24]); + s8[8] = highbd_idct_add_dual(s7[8], s7[23]); + s8[9] = highbd_idct_add_dual(s7[9], s7[22]); + s8[10] = highbd_idct_add_dual(s7[10], s7[21]); + s8[11] = 
highbd_idct_add_dual(s7[11], s7[20]); + s8[12] = highbd_idct_add_dual(s7[12], s6[19]); + s8[13] = highbd_idct_add_dual(s7[13], s6[18]); + s8[14] = highbd_idct_add_dual(s7[14], s6[17]); + s8[15] = highbd_idct_add_dual(s7[15], s6[16]); + s8[16] = highbd_idct_sub_dual(s7[15], s6[16]); + s8[17] = highbd_idct_sub_dual(s7[14], s6[17]); + s8[18] = highbd_idct_sub_dual(s7[13], s6[18]); + s8[19] = highbd_idct_sub_dual(s7[12], s6[19]); + s8[20] = highbd_idct_sub_dual(s7[11], s7[20]); + s8[21] = highbd_idct_sub_dual(s7[10], s7[21]); + s8[22] = highbd_idct_sub_dual(s7[9], s7[22]); + s8[23] = highbd_idct_sub_dual(s7[8], s7[23]); + s8[24] = highbd_idct_sub_dual(s7[7], s7[24]); + s8[25] = highbd_idct_sub_dual(s7[6], s7[25]); + s8[26] = highbd_idct_sub_dual(s7[5], s7[26]); + s8[27] = highbd_idct_sub_dual(s7[4], s7[27]); + s8[28] = highbd_idct_sub_dual(s7[3], s6[28]); + s8[29] = highbd_idct_sub_dual(s7[2], s6[29]); + s8[30] = highbd_idct_sub_dual(s7[1], s6[30]); + s8[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + vst1q_s32(output + 0, s8[0].val[0]); + vst1q_s32(output + 4, s8[0].val[1]); + output += 16; + vst1q_s32(output + 0, s8[1].val[0]); + vst1q_s32(output + 4, s8[1].val[1]); + output += 16; + vst1q_s32(output + 0, s8[2].val[0]); + vst1q_s32(output + 4, s8[2].val[1]); + output += 16; + vst1q_s32(output + 0, s8[3].val[0]); + vst1q_s32(output + 4, s8[3].val[1]); + output += 16; + vst1q_s32(output + 0, s8[4].val[0]); + vst1q_s32(output + 4, s8[4].val[1]); + output += 16; + vst1q_s32(output + 0, s8[5].val[0]); + vst1q_s32(output + 4, s8[5].val[1]); + output += 16; + vst1q_s32(output + 0, s8[6].val[0]); + vst1q_s32(output + 4, s8[6].val[1]); + output += 16; + vst1q_s32(output + 0, s8[7].val[0]); + vst1q_s32(output + 4, s8[7].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[8].val[0]); + vst1q_s32(output + 4, s8[8].val[1]); + output += 16; + vst1q_s32(output + 0, s8[9].val[0]); + vst1q_s32(output + 4, s8[9].val[1]); + output += 16; + vst1q_s32(output + 0, s8[10].val[0]); + vst1q_s32(output + 4, s8[10].val[1]); + output += 16; + vst1q_s32(output + 0, s8[11].val[0]); + vst1q_s32(output + 4, s8[11].val[1]); + output += 16; + vst1q_s32(output + 0, s8[12].val[0]); + vst1q_s32(output + 4, s8[12].val[1]); + output += 16; + vst1q_s32(output + 0, s8[13].val[0]); + vst1q_s32(output + 4, s8[13].val[1]); + output += 16; + vst1q_s32(output + 0, s8[14].val[0]); + vst1q_s32(output + 4, s8[14].val[1]); + output += 16; + vst1q_s32(output + 0, s8[15].val[0]); + vst1q_s32(output + 4, s8[15].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[16].val[0]); + vst1q_s32(output + 4, s8[16].val[1]); + output += 16; + vst1q_s32(output + 0, s8[17].val[0]); + vst1q_s32(output + 4, s8[17].val[1]); + output += 16; + vst1q_s32(output + 0, s8[18].val[0]); + vst1q_s32(output + 4, s8[18].val[1]); + output += 16; + vst1q_s32(output + 0, s8[19].val[0]); + vst1q_s32(output + 4, s8[19].val[1]); + output += 16; + vst1q_s32(output + 0, s8[20].val[0]); + vst1q_s32(output + 4, s8[20].val[1]); + output += 16; + vst1q_s32(output + 0, s8[21].val[0]); + vst1q_s32(output + 4, s8[21].val[1]); + output += 16; + vst1q_s32(output + 0, s8[22].val[0]); + vst1q_s32(output + 4, s8[22].val[1]); + output += 16; + vst1q_s32(output + 0, s8[23].val[0]); + vst1q_s32(output + 4, s8[23].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[24].val[0]); + vst1q_s32(output + 4, s8[24].val[1]); + output += 16; + vst1q_s32(output + 0, s8[25].val[0]); + vst1q_s32(output + 4, s8[25].val[1]); + output += 16; + vst1q_s32(output + 0, s8[26].val[0]); + vst1q_s32(output + 
4, s8[26].val[1]); + output += 16; + vst1q_s32(output + 0, s8[27].val[0]); + vst1q_s32(output + 4, s8[27].val[1]); + output += 16; + vst1q_s32(output + 0, s8[28].val[0]); + vst1q_s32(output + 4, s8[28].val[1]); + output += 16; + vst1q_s32(output + 0, s8[29].val[0]); + vst1q_s32(output + 4, s8[29].val[1]); + output += 16; + vst1q_s32(output + 0, s8[30].val[0]); + vst1q_s32(output + 4, s8[30].val[1]); + output += 16; + vst1q_s32(output + 0, s8[31].val[0]); + vst1q_s32(output + 4, s8[31].val[1]); +} + +static void vpx_highbd_idct32_16_neon(const int32_t *const input, + uint16_t *const output, const int stride, + const int bd) { + int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[17]); + s2[17] = highbd_idct_sub_dual(s1[16], s1[17]); + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[23], s1[22]); + s2[23] = highbd_idct_add_dual(s1[22], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[25]); + s2[25] = highbd_idct_sub_dual(s1[24], s1[25]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + s2[30] = highbd_idct_sub_dual(s1[31], s1[30]); + s2[31] = highbd_idct_add_dual(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + 
s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64); + + s3[8] = highbd_idct_add_dual(s2[8], s2[9]); + s3[9] = highbd_idct_sub_dual(s2[8], s2[9]); + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + s3[14] = highbd_idct_sub_dual(s2[15], s2[14]); + s3[15] = highbd_idct_add_dual(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64, + s2[30], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64, + s2[30], cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[4] = highbd_idct_add_dual(s3[4], s3[5]); + s4[5] = highbd_idct_sub_dual(s3[4], s3[5]); + s4[6] = highbd_idct_sub_dual(s3[7], s3[6]); + s4[7] = highbd_idct_add_dual(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64, + s3[14], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64, + s3[14], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s2[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s2[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s2[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s2[23]); + s4[24] = highbd_idct_add_dual(s2[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s2[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s2[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s2[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s3[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s3[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s3[15], s3[12]); + s5[13] = 
highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s4[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s4[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s4[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + 
s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s7[0], s6[31]); + out[1] = highbd_idct_add_dual(s7[1], s6[30]); + out[2] = highbd_idct_add_dual(s7[2], s6[29]); + out[3] = highbd_idct_add_dual(s7[3], s6[28]); + out[4] = highbd_idct_add_dual(s7[4], s7[27]); + out[5] = highbd_idct_add_dual(s7[5], s7[26]); + out[6] = highbd_idct_add_dual(s7[6], s7[25]); + out[7] = highbd_idct_add_dual(s7[7], s7[24]); + out[8] = highbd_idct_add_dual(s7[8], s7[23]); + out[9] = highbd_idct_add_dual(s7[9], s7[22]); + out[10] = highbd_idct_add_dual(s7[10], s7[21]); + out[11] = highbd_idct_add_dual(s7[11], s7[20]); + out[12] = highbd_idct_add_dual(s7[12], s6[19]); + out[13] = highbd_idct_add_dual(s7[13], s6[18]); + out[14] = highbd_idct_add_dual(s7[14], s6[17]); + out[15] = highbd_idct_add_dual(s7[15], s6[16]); + out[16] = highbd_idct_sub_dual(s7[15], s6[16]); + out[17] = highbd_idct_sub_dual(s7[14], s6[17]); + out[18] = highbd_idct_sub_dual(s7[13], s6[18]); + out[19] = highbd_idct_sub_dual(s7[12], s6[19]); + out[20] = highbd_idct_sub_dual(s7[11], s7[20]); + out[21] = highbd_idct_sub_dual(s7[10], s7[21]); + out[22] = highbd_idct_sub_dual(s7[9], s7[22]); + out[23] = highbd_idct_sub_dual(s7[8], s7[23]); + out[24] = highbd_idct_sub_dual(s7[7], s7[24]); + out[25] = highbd_idct_sub_dual(s7[6], s7[25]); + out[26] = highbd_idct_sub_dual(s7[5], s7[26]); + out[27] = highbd_idct_sub_dual(s7[4], s7[27]); + out[28] = highbd_idct_sub_dual(s7[3], s6[28]); + out[29] = highbd_idct_sub_dual(s7[2], s6[29]); + out[30] = highbd_idct_sub_dual(s7[1], s6[30]); + out[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 16]; + int16_t *t = temp; + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 1); + t += (16 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 16]; + int32_t *t = temp; + vpx_highbd_idct32_12_neon(input, temp); + vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_16_neon(t, dest, stride, bd); + t += (16 * 8); + dest += 8; + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c new file mode 100644 index 0000000000..5d9063b15d --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) { + int32x4x2_t in[8], s1[32], s2[32], s3[32]; + + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 32; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 32; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 32; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 32; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 32; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 32; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 32; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = 
multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64, + s1[30], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64, + s1[30], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64, + s1[31], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64, + s1[31], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64); + + s2[16] = highbd_idct_add_dual(s1[16], s2[23]); + s2[17] = highbd_idct_add_dual(s1[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[17], s2[22]); + s2[23] = highbd_idct_sub_dual(s1[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s1[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s1[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s1[30]); + s2[31] = highbd_idct_add_dual(s2[24], s1[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s2[15]); + s1[1] = highbd_idct_add_dual(s2[1], s2[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s2[9]); + s1[7] = highbd_idct_add_dual(s2[7], s2[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s2[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s2[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = 
highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s2[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64); + + // final stage + s3[0] = highbd_idct_add_dual(s1[0], s2[31]); + s3[1] = highbd_idct_add_dual(s1[1], s2[30]); + s3[2] = highbd_idct_add_dual(s1[2], s2[29]); + s3[3] = highbd_idct_add_dual(s1[3], s2[28]); + s3[4] = highbd_idct_add_dual(s1[4], s1[27]); + s3[5] = highbd_idct_add_dual(s1[5], s1[26]); + s3[6] = highbd_idct_add_dual(s1[6], s1[25]); + s3[7] = highbd_idct_add_dual(s1[7], s1[24]); + s3[8] = highbd_idct_add_dual(s1[8], s1[23]); + s3[9] = highbd_idct_add_dual(s1[9], s1[22]); + s3[10] = highbd_idct_add_dual(s1[10], s1[21]); + s3[11] = highbd_idct_add_dual(s1[11], s1[20]); + s3[12] = highbd_idct_add_dual(s1[12], s2[19]); + s3[13] = highbd_idct_add_dual(s1[13], s2[18]); + s3[14] = highbd_idct_add_dual(s1[14], s2[17]); + s3[15] = highbd_idct_add_dual(s1[15], s2[16]); + s3[16] = highbd_idct_sub_dual(s1[15], s2[16]); + s3[17] = highbd_idct_sub_dual(s1[14], s2[17]); + s3[18] = highbd_idct_sub_dual(s1[13], s2[18]); + s3[19] = highbd_idct_sub_dual(s1[12], s2[19]); + s3[20] = highbd_idct_sub_dual(s1[11], s1[20]); + s3[21] = highbd_idct_sub_dual(s1[10], s1[21]); + s3[22] = highbd_idct_sub_dual(s1[9], s1[22]); + s3[23] = highbd_idct_sub_dual(s1[8], s1[23]); + s3[24] = highbd_idct_sub_dual(s1[7], s1[24]); + s3[25] = highbd_idct_sub_dual(s1[6], s1[25]); + s3[26] = highbd_idct_sub_dual(s1[5], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[4], s1[27]); + s3[28] = highbd_idct_sub_dual(s1[3], s2[28]); + s3[29] = highbd_idct_sub_dual(s1[2], s2[29]); + s3[30] = highbd_idct_sub_dual(s1[1], s2[30]); + s3[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + vst1q_s32(output, s3[0].val[0]); + output += 4; + vst1q_s32(output, s3[0].val[1]); + output += 4; + vst1q_s32(output, s3[1].val[0]); + output += 4; + vst1q_s32(output, s3[1].val[1]); + output += 4; + vst1q_s32(output, s3[2].val[0]); + output += 4; + vst1q_s32(output, s3[2].val[1]); + output += 4; + vst1q_s32(output, s3[3].val[0]); + output += 4; + vst1q_s32(output, s3[3].val[1]); + output += 4; + vst1q_s32(output, s3[4].val[0]); + output += 4; + vst1q_s32(output, s3[4].val[1]); + output += 4; + vst1q_s32(output, s3[5].val[0]); + output += 4; + vst1q_s32(output, s3[5].val[1]); + output += 4; + vst1q_s32(output, s3[6].val[0]); + output += 4; + vst1q_s32(output, s3[6].val[1]); + output += 4; + vst1q_s32(output, s3[7].val[0]); + output += 4; + vst1q_s32(output, s3[7].val[1]); + output += 4; + + vst1q_s32(output, s3[8].val[0]); + output += 4; + vst1q_s32(output, s3[8].val[1]); + output += 4; + vst1q_s32(output, s3[9].val[0]); + output += 4; + vst1q_s32(output, s3[9].val[1]); + output += 4; + vst1q_s32(output, s3[10].val[0]); + output += 4; + vst1q_s32(output, s3[10].val[1]); + output += 4; + vst1q_s32(output, s3[11].val[0]); + output += 4; + 
vst1q_s32(output, s3[11].val[1]); + output += 4; + vst1q_s32(output, s3[12].val[0]); + output += 4; + vst1q_s32(output, s3[12].val[1]); + output += 4; + vst1q_s32(output, s3[13].val[0]); + output += 4; + vst1q_s32(output, s3[13].val[1]); + output += 4; + vst1q_s32(output, s3[14].val[0]); + output += 4; + vst1q_s32(output, s3[14].val[1]); + output += 4; + vst1q_s32(output, s3[15].val[0]); + output += 4; + vst1q_s32(output, s3[15].val[1]); + output += 4; + + vst1q_s32(output, s3[16].val[0]); + output += 4; + vst1q_s32(output, s3[16].val[1]); + output += 4; + vst1q_s32(output, s3[17].val[0]); + output += 4; + vst1q_s32(output, s3[17].val[1]); + output += 4; + vst1q_s32(output, s3[18].val[0]); + output += 4; + vst1q_s32(output, s3[18].val[1]); + output += 4; + vst1q_s32(output, s3[19].val[0]); + output += 4; + vst1q_s32(output, s3[19].val[1]); + output += 4; + vst1q_s32(output, s3[20].val[0]); + output += 4; + vst1q_s32(output, s3[20].val[1]); + output += 4; + vst1q_s32(output, s3[21].val[0]); + output += 4; + vst1q_s32(output, s3[21].val[1]); + output += 4; + vst1q_s32(output, s3[22].val[0]); + output += 4; + vst1q_s32(output, s3[22].val[1]); + output += 4; + vst1q_s32(output, s3[23].val[0]); + output += 4; + vst1q_s32(output, s3[23].val[1]); + output += 4; + + vst1q_s32(output, s3[24].val[0]); + output += 4; + vst1q_s32(output, s3[24].val[1]); + output += 4; + vst1q_s32(output, s3[25].val[0]); + output += 4; + vst1q_s32(output, s3[25].val[1]); + output += 4; + vst1q_s32(output, s3[26].val[0]); + output += 4; + vst1q_s32(output, s3[26].val[1]); + output += 4; + vst1q_s32(output, s3[27].val[0]); + output += 4; + vst1q_s32(output, s3[27].val[1]); + output += 4; + vst1q_s32(output, s3[28].val[0]); + output += 4; + vst1q_s32(output, s3[28].val[1]); + output += 4; + vst1q_s32(output, s3[29].val[0]); + output += 4; + vst1q_s32(output, s3[29].val[1]); + output += 4; + vst1q_s32(output, s3[30].val[0]); + output += 4; + vst1q_s32(output, s3[30].val[1]); + output += 4; + vst1q_s32(output, s3[31].val[0]); + output += 4; + vst1q_s32(output, s3[31].val[1]); +} + +static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, + int stride, const int bd) { + int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], 
cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64, + s1[28], cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64, + s2[12], cospi_24_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[19]); + + s2[17] = highbd_idct_add_dual(s1[17], s1[18]); + s2[18] = highbd_idct_sub_dual(s1[17], s1[18]); + + s2[19] = highbd_idct_sub_dual(s1[16], s1[19]); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + s2[28] = highbd_idct_sub_dual(s1[31], s1[28]); + s2[29] = highbd_idct_sub_dual(s1[30], s1[29]); + s2[30] = highbd_idct_add_dual(s1[29], s1[30]); + s2[31] = highbd_idct_add_dual(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[8] = highbd_idct_add_dual(s2[8], s2[11]); + s1[9] = highbd_idct_add_dual(s2[9], s2[10]); + s1[10] = highbd_idct_sub_dual(s2[9], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[8], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[15], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[14], s2[13]); + s1[14] = highbd_idct_add_dual(s2[13], s2[14]); + s1[15] = highbd_idct_add_dual(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64, + s2[29], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64, + s2[29], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64, + s2[28], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64, + s2[28], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = 
highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64); + + s1[16] = highbd_idct_add_dual(s2[16], s2[23]); + s1[17] = highbd_idct_add_dual(s2[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s1[22] = highbd_idct_sub_dual(s2[17], s2[22]); + s1[23] = highbd_idct_sub_dual(s2[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s2[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s2[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s2[30]); + s2[31] = highbd_idct_add_dual(s2[24], s2[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s1[15]); + s1[1] = highbd_idct_add_dual(s2[1], s1[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s1[9]); + s1[7] = highbd_idct_add_dual(s2[7], s1[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s1[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s1[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s1[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s1[0], s2[31]); + out[1] = highbd_idct_add_dual(s1[1], s2[30]); + out[2] = highbd_idct_add_dual(s1[2], s2[29]); + out[3] = highbd_idct_add_dual(s1[3], s2[28]); + out[4] = highbd_idct_add_dual(s1[4], s1[27]); + out[5] = highbd_idct_add_dual(s1[5], s1[26]); + out[6] = highbd_idct_add_dual(s1[6], s1[25]); + out[7] = highbd_idct_add_dual(s1[7], s1[24]); + out[8] = highbd_idct_add_dual(s1[8], s2[23]); + out[9] = highbd_idct_add_dual(s1[9], s2[22]); + out[10] = highbd_idct_add_dual(s1[10], s1[21]); + out[11] = highbd_idct_add_dual(s1[11], s1[20]); + out[12] = highbd_idct_add_dual(s1[12], s2[19]); + out[13] = highbd_idct_add_dual(s1[13], s2[18]); + out[14] = highbd_idct_add_dual(s1[14], s1[17]); + out[15] = highbd_idct_add_dual(s1[15], s1[16]); + out[16] = highbd_idct_sub_dual(s1[15], s1[16]); + 
out[17] = highbd_idct_sub_dual(s1[14], s1[17]); + out[18] = highbd_idct_sub_dual(s1[13], s2[18]); + out[19] = highbd_idct_sub_dual(s1[12], s2[19]); + out[20] = highbd_idct_sub_dual(s1[11], s1[20]); + out[21] = highbd_idct_sub_dual(s1[10], s1[21]); + out[22] = highbd_idct_sub_dual(s1[9], s2[22]); + out[23] = highbd_idct_sub_dual(s1[8], s2[23]); + out[24] = highbd_idct_sub_dual(s1[7], s1[24]); + out[25] = highbd_idct_sub_dual(s1[6], s1[25]); + out[26] = highbd_idct_sub_dual(s1[5], s1[26]); + out[27] = highbd_idct_sub_dual(s1[4], s1[27]); + out[28] = highbd_idct_sub_dual(s1[3], s2[28]); + out[29] = highbd_idct_sub_dual(s1[2], s2[29]); + out[30] = highbd_idct_sub_dual(s1[1], s2[30]); + out[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 1); + t += (8 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 8]; + int32_t *t = temp; + + vpx_highbd_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_8_neon(t, dest, stride, bd); + t += (8 * 8); + dest += 8; + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c new file mode 100644 index 0000000000..c1354c0c1a --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + const int16x8_t c2 = vminq_s16(b2, max); + const int16x8_t c3 = vminq_s16(b3, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); + vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); + *dest += stride; +} + +static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + const uint16x8_t c2 = vqshluq_n_s16(b2, 0); + const uint16x8_t c3 = vqshluq_n_s16(b3, 0); + vst1q_u16(*dest, c0); + vst1q_u16(*dest + 8, c1); + vst1q_u16(*dest + 16, c2); + vst1q_u16(*dest + 24, c3); + *dest += stride; +} + +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c new file mode 100644 index 0000000000..1418a75a15 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +// res is in reverse row order +static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; +} + +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + 
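// 12-bit input needs 64-bit intermediate products (vmull), which are narrowed
+ // back to 32 bits by the DCT_CONST_BITS rounding shift below. +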
c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); + c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); + c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); + c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); + c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); + c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); + c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); + c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); + c4 = vsubq_s64(c4, c8); + c5 = vsubq_s64(c5, c9); + c6 = vaddq_s64(c6, c10); + c7 = vaddq_s64(c7, c11); + b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), + vrshrn_n_s64(c1, DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), + vrshrn_n_s64(c3, DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), + vrshrn_n_s64(c5, DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), + vrshrn_n_s64(c7, DCT_CONST_BITS)); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int32x4_t c0 = vld1q_s32(input); + int32x4_t c1 = vld1q_s32(input + 4); + int32x4_t c2 = vld1q_s32(input + 8); + int32x4_t c3 = vld1q_s32(input + 12); + int16x8_t a0, a1; + + if (bd == 8) { + const int16x4_t cospis = vld1_s16(kCospi); + + // Rows + a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); + a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + } else { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + } else { + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + } + a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); + a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c new file mode 100644 index 0000000000..dd90134a6e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const int16x8_t c = vminq_s16(b, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c)); + *dest += stride; +} + +static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const uint16x8_t c = vqshluq_n_s16(b, 0); + vst1q_u16(*dest, c); + *dest += stride; +} + +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + const int16x8_t dc = vdupq_n_s16(a1); + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + } else { + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + } +} + +static INLINE void idct8x8_12_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = 
vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t step1l[2], step1h[2]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], 
t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + 
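// Store the last three rows of the clamped 8x8 block. +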
vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 8); + int32x4_t a2 = vld1q_s32(input + 16); + int32x4_t a3 = vld1q_s32(input + 24); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t b0 = vmovn_s32(a0); + int16x4_t b1 = vmovn_s32(a1); + int16x4_t b2 = vmovn_s32(a2); + int16x4_t b3 = vmovn_s32(a3); + int16x4_t b4, b5, b6, b7; + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, + &b5, &b6, &b7); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, + b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); + c0 = vrshrq_n_s16(c0, 5); + c1 = vrshrq_n_s16(c1, 5); + c2 = vrshrq_n_s16(c2, 5); + c3 = vrshrq_n_s16(c3, 5); + c4 = vrshrq_n_s16(c4, 5); + c5 = vrshrq_n_s16(c5, 5); + c6 = vrshrq_n_s16(c6, 5); + c7 = vrshrq_n_s16(c7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; + + if (bd == 10) { + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } else { + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = 
vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + input_5l = vget_low_s32(*io5); + input_5h = vget_high_s32(*io5); + input_7l = vget_low_s32(*io7); + input_7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, 
vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = 
+ vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 4); + int32x4_t a2 = vld1q_s32(input + 8); + int32x4_t a3 = vld1q_s32(input + 12); + int32x4_t a4 = vld1q_s32(input + 16); + int32x4_t a5 = vld1q_s32(input + 20); + int32x4_t a6 = vld1q_s32(input + 24); + int32x4_t a7 = vld1q_s32(input + 28); + int32x4_t a8 = vld1q_s32(input + 32); + int32x4_t a9 = vld1q_s32(input + 36); + int32x4_t a10 = vld1q_s32(input + 40); + int32x4_t a11 = vld1q_s32(input + 44); + int32x4_t a12 = vld1q_s32(input + 48); + int32x4_t a13 = vld1q_s32(input + 52); + int32x4_t a14 = vld1q_s32(input + 56); + int32x4_t a15 = vld1q_s32(input + 60); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); + int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); + int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); + int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); + int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); + int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); + int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); + int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); + + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + + c0 = vrshrq_n_s16(b0, 5); + c1 = vrshrq_n_s16(b1, 5); + c2 = vrshrq_n_s16(b2, 5); + c3 = vrshrq_n_s16(b3, 5); + c4 = vrshrq_n_s16(b4, 5); + c5 = vrshrq_n_s16(b5, 5); + c6 = vrshrq_n_s16(b6, 5); + c7 = vrshrq_n_s16(b7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + &a7, &a15); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + 
&a7, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c new file mode 100644 index 0000000000..6f7e5da762 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c @@ -0,0 +1,1078 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { + const uint16x4_t ref_u16 = vld1_u16(ref); + const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); + return vpadd_u16(p0, p0); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t a = vld1_u16(above); + const uint16x4_t l = vld1_u16(left); + uint16x4_t sum; + uint16x4_t dc; + (void)bd; + sum = vadd_u16(a, l); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 3); + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_4x4(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { + const uint16x8_t ref_u16 = vld1q_u16(ref); + uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x8_t 
dc_dup = vdupq_lane_u16(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t above_u16 = vld1q_u16(above); + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint16x4_t dc; + (void)bd; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 4); + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(left); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(above); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_8x8(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { + const uint16x8x2_t ref_u16 = vld2q_u16(ref); + const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + for (i = 0; i < 16; ++i, dst += stride) { + vst2q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t a = vld2q_u16(above); + const uint16x8x2_t l = vld2q_u16(left); + const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum; + uint16x4_t dc; + (void)bd; + pal1 = vpadd_u16(pal1, pal1); + sum = vpaddl_u16(pal1); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(left); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(above); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + 
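// Fill the 16x16 block with the mid-scale DC value (1 << (bd - 1)). +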
dc_store_16x16(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { + const uint16x8x4_t r = vld4q_u16(ref); + const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); + const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpaddl_u16(sum); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + + for (i = 0; i < 32; ++i) { + vst2q_u16(dst, dc_dup); + dst += 16; + vst2q_u16(dst, dc_dup); + dst += stride - 16; + } +} + +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x4_t a = vld4q_u16(above); + const uint16x8x4_t l = vld4q_u16(left); + const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); + const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum = vpaddl_u16(pal1); + uint16x4_t dc; + (void)bd; + sum = vpadd_u32(sum, sum); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(left); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(above); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_32x32(dst, stride, dc); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t ABCDEFGH = vld1q_u16(above); + const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); + const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); + const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); + const uint16x4_t avg2_low = vget_low_u16(avg2); + const uint16x4_t avg2_high = vget_high_u16(avg2); + const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); + const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); + const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + (void)left; + (void)bd; + vst1_u16(dst, avg2_low); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, r3); + vst1q_lane_u16(dst + 3, ABCDEFGH, 7); +} + +static INLINE void 
d45_store_8(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row) { + *row = vextq_u16(*row, above_right, 1); + vst1q_u16(*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0 = vld1q_u16(above); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); + const uint16x8_t A1 = vld1q_u16(above + 1); + const uint16x8_t A2 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(A0, A2); + uint16x8_t row = vrhaddq_u16(avg1, A1); + (void)left; + (void)bd; + + vst1q_u16(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1q_u16(dst, above_right); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row_0, + uint16x8_t *row_1) { + *row_0 = vextq_u16(*row_0, *row_1, 1); + *row_1 = vextq_u16(*row_1, above_right, 1); + vst1q_u16(*dst, *row_0); + *dst += 8; + vst1q_u16(*dst, *row_1); + *dst += stride - 8; +} + +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + vst1q_u16(dst + 8, row_1); + dst += stride; + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + vst1q_u16(dst, above_right); + vst1q_u16(dst + 8, above_right); +} + +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t A0_2 = vld1q_u16(above + 16); + const uint16x8_t A0_3 = vld1q_u16(above + 24); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A1_2 = 
vld1q_u16(above + 17); + const uint16x8_t A1_3 = vld1q_u16(above + 25); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t A2_2 = vld1q_u16(above + 18); + const uint16x8_t A2_3 = vld1q_u16(above + 26); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); + const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); + uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + int i; + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, above_right, 1); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + } + + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123___ = vld1q_u16(above - 1); + const uint16x4_t L0123 = vld1_u16(left); + const uint16x4_t L3210 = vrev64_u16(L0123); + const uint16x8_t L____3210 = vcombine_u16(L0123, L3210); + const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___)); + const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5); + const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6); + const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_); + const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123); + const uint16x4_t row_0 = vget_low_u16(avg2); + const uint16x4_t row_1 = vget_high_u16(avg2); + const uint16x4_t r0 = vext_u16(row_0, row_1, 3); + const uint16x4_t r1 = vext_u16(row_0, row_1, 2); + const uint16x4_t r2 = vext_u16(row_0, row_1, 1); + (void)bd; + vst1_u16(dst, r0); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, row_0); +} + +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A1234567_ = vld1q_u16(above + 1); + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567); + const uint16x8_t r0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r1 = vextq_u16(row_0, row_1, 6); + const uint16x8_t r2 = 
vextq_u16(row_0, row_1, 5); + const uint16x8_t r3 = vextq_u16(row_0, row_1, 4); + const uint16x8_t r4 = vextq_u16(row_0, row_1, 3); + const uint16x8_t r5 = vextq_u16(row_0, row_1, 2); + const uint16x8_t r6 = vextq_u16(row_0, row_1, 1); + (void)bd; + vst1q_u16(dst, r0); + dst += stride; + vst1q_u16(dst, r1); + dst += stride; + vst1q_u16(dst, r2); + dst += stride; + vst1q_u16(dst, r3); + dst += stride; + vst1q_u16(dst, r4); + dst += stride; + vst1q_u16(dst, r5); + dst += stride; + vst1q_u16(dst, r6); + dst += stride; + vst1q_u16(dst, row_0); +} + +static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row_0, + const uint16x8_t row_1) { + vst1q_u16(*dst, row_0); + *dst += 8; + vst1q_u16(*dst, row_1); + *dst += stride - 8; +} + +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x8_t L89abcdef = vld1q_u16(left + 8); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef)); + const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98); + const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1); + const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987); + + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X); + + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678); + const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567); + + const uint16x8_t A789abcde = vld1q_u16(above + 7); + const uint16x8_t A89abcdef = vld1q_u16(above + 8); + const uint16x8_t A9abcdef_ = vld1q_u16(above + 9); + const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_); + const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef); + + const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7); + const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7); + const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6); + const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6); + const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5); + const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5); + const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4); + const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4); + const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3); + const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3); + const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2); + const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2); + const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1); + const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1); + const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6); + const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5); + const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4); + const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3); + const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2); + const uint16x8_t re_0 
= vextq_u16(row_0, row_1, 1); + (void)bd; + + d135_store_16(&dst, stride, r0_0, r0_1); + d135_store_16(&dst, stride, r1_0, r1_1); + d135_store_16(&dst, stride, r2_0, r2_1); + d135_store_16(&dst, stride, r3_0, r3_1); + d135_store_16(&dst, stride, r4_0, r4_1); + d135_store_16(&dst, stride, r5_0, r5_1); + d135_store_16(&dst, stride, r6_0, r6_1); + d135_store_16(&dst, stride, row_1, row_2); + d135_store_16(&dst, stride, r8_0, r0_0); + d135_store_16(&dst, stride, r9_0, r1_0); + d135_store_16(&dst, stride, ra_0, r2_0); + d135_store_16(&dst, stride, rb_0, r3_0); + d135_store_16(&dst, stride, rc_0, r4_0); + d135_store_16(&dst, stride, rd_0, r5_0); + d135_store_16(&dst, stride, re_0, r6_0); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); +} + +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t LL01234567 = vld1q_u16(left + 16); + const uint16x8_t LL89abcdef = vld1q_u16(left + 24); + const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567)); + const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567)); + const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef)); + const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef)); + const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210); + const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98); + const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1); + const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876); + uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987); + + const uint16x8_t LU01234567 = vld1q_u16(left); + const uint16x8_t LU89abcdef = vld1q_u16(left + 8); + const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567)); + const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567)); + const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef)); + const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef)); + const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210); + const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98); + const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1); + const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2); + const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe); + uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf); + + const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1); + const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2); + const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876); + uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987); + + const uint16x8_t XAL0123456 = vld1q_u16(above - 1); + const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1); + const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2); + const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0); + uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X); + + const uint16x8_t AL01234567 = vld1q_u16(above); + const uint16x8_t AL12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678); + uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567); + + const uint16x8_t AL789abcde = vld1q_u16(above + 7); + const uint16x8_t AL89abcdef = vld1q_u16(above + 8); + const uint16x8_t AL9abcdefg = vld1q_u16(above + 9); + const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg); + uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef); + + const uint16x8_t ALfR0123456 = vld1q_u16(above + 15); + const uint16x8_t 
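The d135 predictors above derive every output sample from the 3-tap rounded average (a + 2*b + c + 2) >> 2 of neighboring reference pixels, and then reuse the filtered vectors for successive rows through vextq_u16 shifts. The vhaddq_u16/vrhaddq_u16 pair keeps that average inside 16 bits; the small harness below (illustrative only, the helper name avg3 is not part of the patch) checks that the two forms agree lane by lane.

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Scalar form of the 3-tap smoothing used by the d135 predictors. */
static uint16_t avg3(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

int main(void) {
  const uint16_t a[8] = { 0, 1, 100, 4095, 4094, 7, 2048, 9 };
  const uint16_t b[8] = { 1, 2, 101, 4095, 4093, 8, 2047, 10 };
  const uint16_t c[8] = { 2, 3, 102, 4094, 4092, 9, 2046, 11 };
  /* (a + c) >> 1 without overflow, then a rounding half-add of b:        */
  /* (((a + c) >> 1) + b + 1) >> 1 equals (a + 2*b + c + 2) >> 2 exactly. */
  const uint16x8_t avg = vhaddq_u16(vld1q_u16(a), vld1q_u16(c));
  const uint16x8_t row = vrhaddq_u16(avg, vld1q_u16(b));
  uint16_t out[8];
  int i;
  vst1q_u16(out, row);
  for (i = 0; i < 8; ++i) assert(out[i] == avg3(a[i], b[i], c[i]));
  return 0;
}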
AR01234567 = vld1q_u16(above + 16); + const uint16x8_t AR12345678 = vld1q_u16(above + 17); + const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678); + uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567); + + const uint16x8_t AR789abcde = vld1q_u16(above + 23); + const uint16x8_t AR89abcdef = vld1q_u16(above + 24); + const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25); + const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_); + uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef); + int i, j; + (void)bd; + + dst += 31 * stride; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst -= stride + 24; + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, row_4, 1); + row_4 = vextq_u16(row_4, row_4, 1); + } + row_4 = row_5; + row_5 = row_6; + row_6 = row_7; + } +} + +//------------------------------------------------------------------------------ + +void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t row = vld1_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 4; i++, dst += stride) { + vst1_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row = vld1q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 8; i++, dst += stride) { + vst1q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row = vld2q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 16; i++, dst += stride) { + vst2q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row0 = vld2q_u16(above); + const uint16x8x2_t row1 = vld2q_u16(above + 16); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 32; i++) { + vst2q_u16(dst, row0); + dst += 16; + vst2q_u16(dst, row1); + dst += stride - 16; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t left_u16 = vld1_u16(left); + uint16x4_t row; + (void)above; + (void)bd; + + row = vdup_lane_u16(left_u16, 0); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 1); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 2); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 3); + vst1_u16(dst, row); +} + +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16); + const uint16x4_t left_high = vget_high_u16(left_u16); + uint16x8_t row; + (void)above; + (void)bd; + + row = vdupq_lane_u16(left_low, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 3); + vst1q_u16(dst, 
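The V and H predictors above are pure data movement: V replicates the row of `above` samples down the block, and H broadcasts each `left` sample across its row (vdup_lane/vdupq_lane in the NEON versions, with vld2q/vst2q pairs for the 16- and 32-wide cases). A scalar reference for an N x N high-bitdepth block, with the helper names invented here for illustration:

#include <stddef.h>
#include <stdint.h>

static void v_predictor(uint16_t *dst, ptrdiff_t stride, int N,
                        const uint16_t *above) {
  int r, c;
  for (r = 0; r < N; ++r, dst += stride) {
    for (c = 0; c < N; ++c) dst[c] = above[c]; /* copy the top row down */
  }
}

static void h_predictor(uint16_t *dst, ptrdiff_t stride, int N,
                        const uint16_t *left) {
  int r, c;
  for (r = 0; r < N; ++r, dst += stride) {
    for (c = 0; c < N; ++c) dst[c] = left[r]; /* broadcast the left pixel */
  }
}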
row); + dst += stride; + row = vdupq_lane_u16(left_high, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 3); + vst1q_u16(dst, row); +} + +static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 8; +} + +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_16(&dst, stride, row); + } +} + +static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 24; +} + +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_32(&dst, stride, row); + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x4_t above_s16d = vld1_s16((const int16_t *)above); + const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d); + const int16x4_t left_s16 = vld1_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x8_t sum; + uint16x8_t row; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = 
vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); +} + +static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub, + const int16x8_t max) { + uint16x8_t row; + int16x8_t sum = vaddq_s16(left_dup, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1q_u16(*dst, row); + *dst += stride; +} + +void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above_s16 = vld1q_s16((const int16_t *)above); + const int16x8_t left_s16 = vld1q_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x4_t left_s16d; + int16x8_t left_dup; + int i; + + left_s16d = vget_low_s16(left_s16); + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub, max); + } +} + +static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t max) { + uint16x8_t row0, row1; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += stride - 8; +} + +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 2; j++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + } + } +} + +static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const 
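The TM (true-motion) predictors compute pred[r][c] = left[r] + above[c] - above[-1], clipped to [0, (1 << bd) - 1]. The NEON code forms sub = above - top_left once, adds a broadcast left[r] per row, clamps the high side with vminq_s16, and lets vqshluq_n_s16(sum, 0) saturate negative sums to zero while converting to unsigned. A scalar sketch of the same formula (clamp_px/tm_predictor are illustrative names only):

#include <stddef.h>
#include <stdint.h>

static uint16_t clamp_px(int v, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

static void tm_predictor(uint16_t *dst, ptrdiff_t stride, int N,
                         const uint16_t *above, const uint16_t *left, int bd) {
  const int top_left = above[-1]; /* pixel above-left of the block */
  int r, c;
  for (r = 0; r < N; ++r, dst += stride) {
    for (c = 0; c < N; ++c) {
      dst[c] = clamp_px(left[r] + above[c] - top_left, bd);
    }
  }
}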
int16x8_t sub3, const int16x8_t max) { + uint16x8_t row0, row1, row2, row3; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + int16x8_t sum2 = vaddq_s16(left_dup, sub2); + int16x8_t sum3 = vaddq_s16(left_dup, sub3); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + sum2 = vminq_s16(sum2, max); + sum3 = vminq_s16(sum3, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + row2 = vqshluq_n_s16(sum2, 0); + row3 = vqshluq_n_s16(sum3, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += 8; + vst1q_u16(*dst, row2); + *dst += 8; + vst1q_u16(*dst, row3); + *dst += stride - 24; +} + +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16)); + const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + const int16x8_t sub2 = vsubq_s16(above2, top_left); + const int16x8_t sub3 = vsubq_s16(above3, top_left); + int16x8_t left_dup; + int i, j; + + for (i = 0; i < 4; i++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c new file mode 100644 index 0000000000..5530c6425b --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, uint16x8_t *blimit_vec, + uint16x8_t *limit_vec, uint16x8_t *thresh_vec, + const int bd) { + const int16x8_t shift = vdupq_n_s16(bd - 8); + *blimit_vec = vmovl_u8(vld1_dup_u8(blimit)); + *limit_vec = vmovl_u8(vld1_dup_u8(limit)); + *thresh_vec = vmovl_u8(vld1_dup_u8(thresh)); + *blimit_vec = vshlq_u16(*blimit_vec, shift); + *limit_vec = vshlq_u16(*limit_vec, shift); + *thresh_vec = vshlq_u16(*thresh_vec, shift); +} + +// Here flat is 128-bit long, with each 16-bit chunk being a mask of +// a pixel. 
When used to control filter branches, we only detect whether it is +// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status(const uint16x8_t flat) { + const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)), + vreinterpret_u64_u16(vget_high_u16(flat))); + const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0)); + return vget_lane_u32(vreinterpret_u32_u64(t1), 0); +} + +static INLINE uint16x8_t +filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit, + const uint16x8_t thresh, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) { + uint16x8_t max, t0, t1; + + max = vabdq_u16(p1, p0); + max = vmaxq_u16(max, vabdq_u16(q1, q0)); + *hev = vcgtq_u16(max, thresh); + *mask = vmaxq_u16(max, vabdq_u16(p3, p2)); + *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2)); + t0 = vabdq_u16(p0, q0); + t1 = vabdq_u16(p1, q1); + t0 = vaddq_u16(t0, t0); + t1 = vshrq_n_u16(t1, 1); + t0 = vaddq_u16(t0, t1); + *mask = vcleq_u16(*mask, limit); + t0 = vcleq_u16(t0, blimit); + *mask = vandq_u16(*mask, t0); + + return max; +} + +static INLINE uint16x8_t filter_flat_hev_mask( + const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh, + const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat, + uint32_t *flat_status, uint16x8_t *hev, const int bd) { + uint16x8_t mask; + const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0, + q0, q1, q2, q3, hev, &mask); + *flat = vmaxq_u16(max, vabdq_u16(p2, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0)); + *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0)); + *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */ + *flat = vandq_u16(*flat, mask); + *flat_status = calc_flat_status(*flat); + + return mask; +} + +static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, + const uint16x8_t flat, + uint32_t *flat2_status, const int bd) { + uint16x8_t flat2 = vabdq_u16(p4, p0); + flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0)); + flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8))); + flat2 = vandq_u16(flat2, flat); + *flat2_status = calc_flat_status(flat2); + + return flat2; +} + +static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) { + const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8)); + return vreinterpretq_s16_u16(vsubq_u16(v, offset)); +} + +static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) { + const int16x8_t offset = vdupq_n_s16(0x80 
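filter_hev_mask4() and filter_flat_hev_mask() above turn per-pixel threshold tests into 0x0000/0xffff lane masks so the later steps can blend with vbslq_u16 instead of branching. Written out for a single pixel position (a scalar restatement for illustration, not part of the patch), the tests are:

#include <stdint.h>
#include <stdlib.h>

/* filter mask: every local difference within `limit`, and the edge
 * difference |p0-q0|*2 + |p1-q1|/2 within `blimit`. */
static int filter_mask(int limit, int blimit, int p3, int p2, int p1, int p0,
                       int q0, int q1, int q2, int q3) {
  int m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(p3 - p2) > m) m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  return m <= limit && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

/* hev: "high edge variance" selects the stronger inner-tap adjustment. */
static int hev_mask(int thresh, int p1, int p0, int q0, int q1) {
  return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}

The flat and flat2 masks follow the same pattern with the wider |p_n - p0| and |q_n - q0| differences compared against 1 << (bd - 8), and load_thresh() pre-shifts the 8-bit blimit/limit/thresh values by (bd - 8) so every comparison happens at full bit depth.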
<< (bd - 8)); + return vreinterpretq_u16_s16(vaddq_s16(v, offset)); +} + +static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, + uint16x8_t *sum) { + *sum = vsubq_u16(*sum, sub0); + *sum = vsubq_u16(*sum, sub1); + *sum = vaddq_u16(*sum, add0); + *sum = vaddq_u16(*sum, add1); +} + +static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0, + const uint16x8_t sub1, + const uint16x8_t add0, + const uint16x8_t add1, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vrshrq_n_u16(*sum, 3); +} + +static INLINE uint16x8_t apply_15_tap_filter_kernel( + const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t sum; + sum = vaddq_u16(p3, p3); // 2*p3 + sum = vaddq_u16(sum, p3); // 3*p3 + sum = vaddq_u16(sum, p2); // 3*p3+p2 + sum = vaddq_u16(sum, p2); // 3*p3+2*p2 + sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1 + sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0 + *op2 = vrshrq_n_u16(sum, 3); + *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void apply_7_tap_filter(const uint16x8_t flat, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t tp1, tp0, tq0, tq1; + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1, + oq2); + *op2 = vbslq_u16(flat, *op2, p2); + *op1 = vbslq_u16(flat, tp1, *op1); + *op0 = vbslq_u16(flat, tp0, *op0); + *oq0 = vbslq_u16(flat, tq0, *oq0); + *oq1 = vbslq_u16(flat, tq1, *oq1); + *oq2 = vbslq_u16(flat, *oq2, q2); +} + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter( + const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, + uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) { + uint16x8_t sum; + sum = vshlq_n_u16(p7, 3); // 8*p7 + sum = vsubq_u16(sum, p7); // 7*p7 + sum = vaddq_u16(sum, p6); // 7*p7+p6 + sum = vaddq_u16(sum, p6); // 7*p7+2*p6 + sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5 + 
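calc_7_tap_filter() evaluates the [1, 1, 1, 2, 1, 1, 1] smoothing as a running sum: the first output is formed in full, then filter_update() drops two samples and adds two for each following tap position, and vrshrq_n_u16(sum, 3) applies the rounding shift. The same scheme written as scalar code (sketch only; ROUND_POWER_OF_TWO is defined locally to keep it self-contained):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static void filter8_taps(uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
                         uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3,
                         uint16_t *op2, uint16_t *op1, uint16_t *op0,
                         uint16_t *oq0, uint16_t *oq1, uint16_t *oq2) {
  uint32_t sum = 3 * p3 + 2 * p2 + p1 + p0 + q0;
  *op2 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p1 + q1 - p3 - p2; /* drop p3, p2; add p1, q1 */
  *op1 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p0 + q2 - p3 - p1; /* drop p3, p1; add p0, q2 */
  *op0 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q0 + q3 - p3 - p0; /* drop p3, p0; add q0, q3 */
  *oq0 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q1 + q3 - p2 - q0; /* drop p2, q0; add q1, q3 */
  *oq1 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q2 + q3 - p1 - q1; /* drop p1, q1; add q2, q3 */
  *oq2 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
}

The 15-tap path that follows works the same way with a wider window and a rounding shift by 4 instead of 3.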
sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1); + const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1))); + int16x8_t filter, filter1, filter2, t; + int16x8_t ps1 = flip_sign(p1, bd); + int16x8_t ps0 = flip_sign(p0, bd); + int16x8_t qs0 = flip_sign(q0, bd); + int16x8_t qs1 = flip_sign(q1, bd); + + /* add outer taps if we have high edge variance */ + filter = vsubq_s16(ps1, qs1); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(hev)); + t = vsubq_s16(qs0, ps0); + + /* inner taps */ + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(mask)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ + /* we'd round it by 3 the other way */ + t = vaddq_s16(filter, vdupq_n_s16(4)); + t = vminq_s16(t, max); + filter1 = vshrq_n_s16(t, 3); + t = vaddq_s16(filter, vdupq_n_s16(3)); + t = vminq_s16(t, max); + filter2 = vshrq_n_s16(t, 3); + + qs0 = vsubq_s16(qs0, filter1); + qs0 = vmaxq_s16(qs0, min); + qs0 = vminq_s16(qs0, max); + ps0 = vaddq_s16(ps0, filter2); + ps0 = vmaxq_s16(ps0, min); + ps0 = vminq_s16(ps0, max); + *oq0 = flip_sign_back(qs0, bd); + *op0 = flip_sign_back(ps0, bd); + + /* outer tap adjustments */ + filter = vrshrq_n_s16(filter1, 1); + filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev)); + + qs1 = vsubq_s16(qs1, filter); + qs1 = vmaxq_s16(qs1, min); + qs1 = vminq_s16(qs1, max); + ps1 = vaddq_s16(ps1, filter); + ps1 = vmaxq_s16(ps1, min); + ps1 = vminq_s16(ps1, max); + *oq1 = flip_sign_back(qs1, bd); + *op1 = flip_sign_back(ps1, bd); +} + +static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat, + const uint32_t flat_status, const uint16x8_t hev, + const uint16x8_t p3, const uint16x8_t 
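filter4() runs the standard 4-tap loop filter on sign-flipped samples: flip_sign() subtracts 0x80 << (bd - 8) so the arithmetic can use signed saturation around zero, and flip_sign_back() restores the offset afterwards. A per-pixel scalar restatement, assuming arithmetic right shifts as vshrq_n_s16 provides (illustrative; clamp_s/filter4_px are names invented here):

#include <stdint.h>

static int clamp_s(int v, int bd) {
  const int hi = (1 << (bd - 1)) - 1;
  const int lo = -(1 << (bd - 1));
  return v < lo ? lo : (v > hi ? hi : v);
}

static void filter4_px(int mask, int hev, uint16_t *p1, uint16_t *p0,
                       uint16_t *q0, uint16_t *q1, int bd) {
  const int offset = 0x80 << (bd - 8);
  int ps1 = *p1 - offset, ps0 = *p0 - offset;
  int qs0 = *q0 - offset, qs1 = *q1 - offset;
  int f, f1, f2;

  f = clamp_s(ps1 - qs1, bd);
  if (!hev) f = 0;              /* outer tap only on high edge variance */
  f = clamp_s(f + 3 * (qs0 - ps0), bd);
  if (!mask) f = 0;             /* position not filtered at all */

  f1 = clamp_s(f + 4, bd) >> 3; /* q0 side rounds by 4 ... */
  f2 = clamp_s(f + 3, bd) >> 3; /* ... p0 side rounds by 3 */
  qs0 = clamp_s(qs0 - f1, bd);
  ps0 = clamp_s(ps0 + f2, bd);

  f = (f1 + 1) >> 1;            /* outer tap adjustment */
  if (hev) f = 0;
  qs1 = clamp_s(qs1 - f, bd);
  ps1 = clamp_s(ps1 + f, bd);

  *p1 = (uint16_t)(ps1 + offset);
  *p0 = (uint16_t)(ps0 + offset);
  *q0 = (uint16_t)(qs0 + offset);
  *q1 = (uint16_t)(qs1 + offset);
}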
p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + *op2 = p2; + *oq2 = q2; + if (flat_status) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + } else { + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1, + oq2); + } +} + +static INLINE void filter16( + const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status, + const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev, + const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5, + const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6, + const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4, + uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3, + uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + } + + if (flat_status) { + *op2 = p2; + *oq2 = q2; + if (flat2_status != (uint32_t)-4) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + if (flat2_status) { + apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, + q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, + oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } + } +} + +static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3, + uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0, + uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2, + uint16x8_t *q3) { + *p3 = vld1q_u16(s); + s += p; + *p2 = vld1q_u16(s); + s += p; + *p1 = vld1q_u16(s); + s += p; + *p0 = vld1q_u16(s); + s += p; + *q0 = vld1q_u16(s); + s += p; + *q1 = vld1q_u16(s); + s += p; + *q2 = vld1q_u16(s); + s += p; + *q3 = vld1q_u16(s); +} + +static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0, + uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, + uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6, + uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9, + uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12, + uint16x8_t *s13, uint16x8_t *s14, + uint16x8_t *s15) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); + s += p; + *s8 = vld1q_u16(s); + s += p; + *s9 = vld1q_u16(s); + s += p; + *s10 = vld1q_u16(s); + s += p; + *s11 = vld1q_u16(s); + s += p; + *s12 = vld1q_u16(s); + s += p; + *s13 = vld1q_u16(s); + s += p; + *s14 = vld1q_u16(s); + s += p; + *s15 = vld1q_u16(s); +} + +static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const 
uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); +} + +static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1) { + uint16x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4q_lane_u16(s, o, 0); + s += p; + vst4q_lane_u16(s, o, 1); + s += p; + vst4q_lane_u16(s, o, 2); + s += p; + vst4q_lane_u16(s, o, 3); + s += p; + vst4q_lane_u16(s, o, 4); + s += p; + vst4q_lane_u16(s, o, 5); + s += p; + vst4q_lane_u16(s, o, 6); + s += p; + vst4q_lane_u16(s, o, 7); +} + +static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + uint16x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3q_lane_u16(s - 3, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst3q_lane_u16(s - 3, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst3q_lane_u16(s - 3, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst3q_lane_u16(s - 3, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst3q_lane_u16(s - 3, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst3q_lane_u16(s - 3, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst3q_lane_u16(s - 3, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst3q_lane_u16(s - 3, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5, const uint16x8_t s6) { + uint16x8x4_t o0; + uint16x8x3_t o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o0.val[3] = s3; + o1.val[0] = s4; + o1.val[1] = s5; + o1.val[2] = s6; + vst4q_lane_u16(s - 4, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst4q_lane_u16(s - 4, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst4q_lane_u16(s - 4, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst4q_lane_u16(s - 4, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst4q_lane_u16(s - 4, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst4q_lane_u16(s - 4, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst4q_lane_u16(s - 4, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst4q_lane_u16(s - 4, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint32_t flat_status, + const uint32_t flat2_status) { + if (flat_status) { + if (flat2_status) { + vst1q_u16(s - 7 * p, p6); + vst1q_u16(s - 6 * p, p5); + vst1q_u16(s - 5 * p, p4); + vst1q_u16(s - 4 * p, p3); + vst1q_u16(s + 3 * p, q3); + vst1q_u16(s + 4 * p, q4); + vst1q_u16(s + 5 * p, q5); + vst1q_u16(s + 6 * p, q6); + } + vst1q_u16(s - 3 * p, p2); + vst1q_u16(s + 2 * p, q2); + } + vst1q_u16(s - 2 * p, p1); + vst1q_u16(s - 1 * p, p0); + vst1q_u16(s + 0 * p, q0); + vst1q_u16(s + 1 * p, q1); +} + +void 
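filter8()/filter16() and store_8x14() together implement a widest-filter-wins scheme; the summary below lists which rows each branch may rewrite, which is also exactly the set of rows store_8x14() writes back.

/* Per pixel position (explanatory summary of filter16 + store_8x14):
 *   mask only              -> filter4 rewrites  p1 p0 q0 q1
 *   mask && flat           -> 7-tap   rewrites  p2 p1 p0 q0 q1 q2
 *   mask && flat && flat2  -> 15-tap  rewrites  p6 ... p0 q0 ... q6
 * The aggregated flat_status / flat2_status values only let whole branches
 * and stores be skipped when no lane (or every lane) is flagged. */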
vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_8x4(s - 2 * p, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_horizontal_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_4x8(s - 2, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_vertical_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_horizontal_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, 
&q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + // Note: store_6x8() is faster than transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_vertical_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +static void lpf_horizontal_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, + const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +static void lpf_vertical_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0); + transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5, + (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2, + (int16x8_t *)&p1, (int16x8_t *)&p0); + load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2, + (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5, + (int16x8_t *)&q6, (int16x8_t *)&q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + if (flat_status) { + if (flat2_status) { + store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0); + store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } else { + // Note: store_6x8() is faster than 
transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); + } + } else { + store_4x8(s - 2, p, op1, op0, oq0, oq1); + } +} + +void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd); +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c new file mode 100644 index 0000000000..c46c016312 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -0,0 +1,931 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); + s += p; + vst1q_u16(s, s6); + s += p; + vst1q_u16(s, s7); +} + +static INLINE int32x4_t highbd_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum; + + sum = vmull_lane_s16(s0, filters_lo, 0); + sum = vmlal_lane_s16(sum, s1, filters_lo, 1); + sum = vmlal_lane_s16(sum, s2, filters_lo, 2); + sum = vmlal_lane_s16(sum, s3, filters_lo, 3); + sum = vmlal_lane_s16(sum, s4, filters_hi, 0); + sum = vmlal_lane_s16(sum, s5, filters_hi, 1); + sum = vmlal_lane_s16(sum, s6, filters_hi, 2); + sum = vmlal_lane_s16(sum, s7, filters_hi, 3); + return sum; +} + +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, const uint16x8_t max) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum0, sum1; + uint16x8_t d; + + sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); + sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); + sum1 = vmlal_lane_s16(sum1, 
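highbd_convolve8_4() and highbd_convolve8_8() evaluate an 8-tap FIR with vmull_lane_s16/vmlal_lane_s16 multiply-accumulates; vqrshrun_n_s32(sum, 7) then applies the rounding shift by FILTER_BITS and saturates negative results to zero, and vminq_u16 clamps to the bd-bit maximum. For one output pixel the computation is simply (scalar sketch, constants defined locally for self-containment):

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static uint16_t convolve8_px(const int16_t *s, const int16_t *filter, int bd) {
  const int max = (1 << bd) - 1;
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += s[k] * filter[k]; /* 8-tap FIR */
  sum = ROUND_POWER_OF_TWO(sum, FILTER_BITS);
  return (uint16_t)(sum < 0 ? 0 : (sum > max ? max : sum));
}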
vget_high_s16(s2), filters_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); + d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); + d = vminq_u16(d, max); + return d; +} + +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + } else { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + transpose_u16_4x4q(&d01, &d23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * 
dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + transpose_u16_8x4(&d0, &d1, &d2, &d3); + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); + + transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; 
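A short note on the structure of the 8-wide horizontal paths above (and of the avg variant that follows), since it is easy to lose in the intrinsics:

/* Each 8x8 tile of source rows is transposed so that one vector holds one
 * source column, the 8-tap filter is applied across s0..s14 to produce eight
 * filtered columns, and the results are transposed back before the store.
 * Between horizontally adjacent tiles only eight new vectors are loaded,
 * because s0..s6 take over the previous tile's s8..s14. */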
+ } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + transpose_u16_4x4q(&t01, &t23); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 2 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + 
__builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + + d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 4 * dst_stride)); + d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 5 * dst_stride)); + d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 6 * dst_stride)); + d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), + vld1_u16(dst + 7 * dst_stride)); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); + + 
transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); + d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); + d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); + d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); + + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + } else { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + do { 
+ __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d += dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} + +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 
1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 1 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vld1q_u16(d + 0 * dst_stride); + d1 = vld1q_u16(d + 1 * dst_stride); + d2 = vld1q_u16(d + 2 * dst_stride); + d3 = vld1q_u16(d + 3 * dst_stride); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d 
+= dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c new file mode 100644 index 0000000000..765a054f8d --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + if (w < 8) { // avg4 + uint16x4_t s0, s1, d0, d1; + uint16x8_t s01, d01; + do { + s0 = vld1_u16(src); + d0 = vld1_u16(dst); + src += src_stride; + s1 = vld1_u16(src); + d1 = vld1_u16(dst + dst_stride); + src += src_stride; + s01 = vcombine_u16(s0, s1); + d01 = vcombine_u16(d0, d1); + d01 = vrhaddq_u16(s01, d01); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // avg8 + uint16x8_t s0, s1, d0, d1; + do { + s0 = vld1q_u16(src); + d0 = vld1q_u16(dst); + src += src_stride; + s1 = vld1q_u16(src); + d1 = vld1q_u16(dst + dst_stride); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + + vst1q_u16(dst, d0); + dst += dst_stride; + vst1q_u16(dst, d1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h; + do { + s0l = vld1q_u16(src); + s0h = vld1q_u16(src + 8); + d0l = vld1q_u16(dst); + d0h = vld1q_u16(dst + 8); + src += src_stride; + s1l = vld1q_u16(src); + s1h = vld1q_u16(src + 8); + d1l = vld1q_u16(dst + dst_stride); + d1h = vld1q_u16(dst + dst_stride + 8); + src += src_stride; + + d0l = vrhaddq_u16(s0l, d0l); + d0h = vrhaddq_u16(s0h, d0h); + d1l = vrhaddq_u16(s1l, d1l); + d1h = vrhaddq_u16(s1h, d1h); + + vst1q_u16(dst, d0l); + vst1q_u16(dst + 8, d0h); + dst += dst_stride; + vst1q_u16(dst, d1l); + vst1q_u16(dst + 8, d1h); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 
= vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + + s0 = vld1q_u16(src + 32); + s1 = vld1q_u16(src + 40); + s2 = vld1q_u16(src + 48); + s3 = vld1q_u16(src + 56); + d0 = vld1q_u16(dst + 32); + d1 = vld1q_u16(dst + 40); + d2 = vld1q_u16(dst + 48); + d3 = vld1q_u16(dst + 56); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst + 32, d0); + vst1q_u16(dst + 40, d1); + vst1q_u16(dst + 48, d2); + vst1q_u16(dst + 56, d3); + src += src_stride; + dst += dst_stride; + } while (--h); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c new file mode 100644 index 0000000000..9d2752e097 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + if (w < 8) { // copy4 + do { + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // copy8 + do { + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // copy16 + do { + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 32) { // copy32 + do { + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { // copy64 + do { + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c new file mode 100644 index 0000000000..414ade3530 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // + 1 to make it divisible by 4 + uint16_t temp[64 * 136]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* Filter starting 3 lines back. The neon implementation will ignore the given + * height and filter a multiple of 4 lines. 
Since this goes in to the temp + * buffer which has lots of extra room and is subsequently discarded this is + * safe if somewhat less than ideal. */ + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // + 1 to make it divisible by 4 + uint16_t temp[64 * 136]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); +} diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm deleted file mode 100644 index dc459e20d9..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ /dev/null @@ -1,198 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
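As an aside on the intermediate-buffer sizing used by vpx_highbd_convolve8_neon above: the sketch below is illustrative only (not part of the patch) and assumes SUBPEL_BITS == 4 and SUBPEL_TAPS == 8, the values defined in vpx_dsp/vpx_filter.h. For the unscaled y_step_q4 == 16 case the expression reduces to h + 7, so even a 64x64 block needs only 71 of the 136 rows in the temp buffer.

#include <stdio.h>

/* Illustrative helper mirroring the intermediate_height expression above. */
#define SUBPEL_BITS 4
#define SUBPEL_TAPS 8

static int intermediate_height(int h, int y0_q4, int y_step_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

int main(void) {
  printf("%d\n", intermediate_height(64, 15, 16)); /* prints 71 */
  return 0;
}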
-; - - - EXPORT |vpx_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vpx_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - 
vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct16x16_1_add_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index 466b408893..bf5192a683 100644 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -10,49 +10,68 @@ #include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += 
dest_stride; - } - } - return; +static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqaddq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} + +static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqsubq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} + +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x16_t dc = create_dcq(-a1); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + } } diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm deleted file mode 100644 index 22a0c95941..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ /dev/null @@ -1,1179 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_idct16x16_256_add_neon_pass1| - EXPORT |vpx_idct16x16_256_add_neon_pass2| - EXPORT |vpx_idct16x16_10_add_neon_pass1| - EXPORT |vpx_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. 
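For the rewritten vpx_idct16x16_1_add_neon above, the DC value a1 follows directly from constants that appear elsewhere in this diff (cospi_16_64 == 11585, a 14-bit round shift, and ROUND_POWER_OF_TWO(out, 6)). The scalar sketch below is illustrative only, ignores WRAPLOW's wrapping, and uses a hypothetical function name; because the destination pixels are unsigned bytes, the code above applies a positive a1 with vqaddq_u8 and a negative one with vqsubq_u8 on its magnitude.

#include <stdint.h>
#include <stdio.h>

/* Illustrative scalar model of the DC-only computation. */
static int16_t idct16x16_dc(int16_t input0) {
  const int32_t cospi_16_64 = 11585;
  const int16_t out0 =
      (int16_t)(((int32_t)input0 * cospi_16_64 + (1 << 13)) >> 14);
  const int16_t out1 = (int16_t)((out0 * cospi_16_64 + (1 << 13)) >> 14);
  return (int16_t)((out1 + 32) >> 6); /* ROUND_POWER_OF_TWO(out1, 6) */
}

int main(void) {
  printf("%d\n", idct16x16_dc(100)); /* out0 == 71, out1 == 50, prints 1 */
  return 0;
}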
- MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 - - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, 
#14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass1| - -;void vpx_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. 
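The recurring "vqrshrn.s32 ..., #14" under the dct_const_round_shift comments in the assembly above is the rounding narrow that implements that macro: it adds 1 << 13, shifts right by 14, and saturates to 16 bits. A one-line intrinsics sketch of the same mapping follows; it is illustrative only and the accumulator name is hypothetical.

#include <arm_neon.h>

/* vqrshrn_n_s32(acc, 14) == saturate16((acc + (1 << 13)) >> 14),
 * i.e. dct_const_round_shift(acc) up to int16 saturation. */
static int16x4_t dct_const_round_shift_vec(const int32x4_t acc) {
  return vqrshrn_n_s32(acc, 14);
}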
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 - - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 - - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f - - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; 
step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. 
- ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + 
dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 
; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass2| - -;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 - - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. 
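The doubled-constant trick described just above carries over directly to intrinsics: vqrdmulh computes saturate((2 * a * b + (1 << 15)) >> 16), so feeding it 2 * cospi_28_64 == 6392 yields saturate((a * cospi_28_64 + (1 << 13)) >> 14), which is dct_const_round_shift(a * cospi_28_64). A minimal sketch, illustrative only and with a hypothetical register name:

#include <arm_neon.h>

/* dct_const_round_shift(step2_4 * cospi_28_64) via the doubled constant,
 * matching the asm above (cospi_28_64 * 2 == 6392). */
static int16x8_t round_shift_by_cospi_28_64(const int16x8_t step2_4) {
  return vqrdmulhq_n_s16(step2_4, 6392);
}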
- ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass1| - -;void vpx_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! 
- vmov.s16 q15, q0; - - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c - - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. 
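Stage 6 below mixes the step1[10]/step1[13] and step1[11]/step1[12] pairs with cospi_16_64 (11585) and rounds with vqrshrn #14. A scalar sketch of that butterfly, using an illustrative helper name that is not part of the patch (saturation ignored):

/* Scalar model of the stage 6 cospi_16_64 butterfly performed below:
 *   *out_lo = dct_const_round_shift((b - a) * cospi_16_64)
 *   *out_hi = dct_const_round_shift((b + a) * cospi_16_64) */
static void butterfly_cospi_16_16(int16_t a, int16_t b, int16_t *out_lo,
                                  int16_t *out_hi) {
  const int32_t t1 = ((int32_t)b - a) * 11585; /* cospi_16_64 */
  const int32_t t2 = ((int32_t)b + a) * 11585;
  *out_lo = (int16_t)((t1 + (1 << 13)) >> 14);
  *out_hi = (int16_t)((t2 + (1 << 13)) >> 14);
}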
- ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 
{d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass2| - END diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index 5ca2afe488..5c5963d277 100644 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -10,1222 +10,814 @@ #include -#include "./vpx_config.h" -#include "vpx_dsp/arm/transpose_neon.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 
= vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = 
vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; +static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, + int16x4_t *const d1) { + *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS); } -void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; +static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, + const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1); + t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1); +} - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; +static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, int16x4_t *const d1) { + int32x4_t t32[2]; - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + wrap_low_4x2(t32, d0, d1); +} - d16s16 
= vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); +static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[2]; - // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[1] = vnegq_s32(t32[1]); + wrap_low_4x2(t32, d0, d1); +} - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); +static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[3]; - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + wrap_low_4x2(t32, d0, d1); +} - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 6); + out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, 
&dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0); + const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1); + const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1); + int16x8_t in[16], step1[16], step2[16], out[16]; - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = 
vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = 
vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - 
d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = 
vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = 
load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); } - return; -} -void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]); + idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]); + idct_cospi_12_20(step2[5], step2[6], 
cospi_4_12_20N_28, &step1[5], &step1[6]); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t 
*dest, int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]); + idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + 
step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = 
vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); + idct16x16_add_stage7(step2, out); - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - 
vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x8_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[7] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 16; + in[1] = vld1q_s16(inputT); + inputT += 16; + in[2] = vld1q_s16(inputT); + inputT += 16; + in[3] = vld1q_s16(inputT); + inputT += 16; + in[4] = vld1q_s16(inputT); + inputT += 16; + in[5] = vld1q_s16(inputT); + inputT += 16; + in[6] = vld1q_s16(inputT); + inputT += 16; + in[7] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3); + step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2); + step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + 
step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2); + step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3); + step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = load_tran_low_to_s16d(input); + input += 16; + in[1] 
= load_tran_low_to_s16d(input); + input += 16; + in[2] = load_tran_low_to_s16d(input); + input += 16; + in[3] = load_tran_low_to_s16d(input); + + // Transpose + transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vadd_s16(step2[8], step2[11]); + step1[9] = vadd_s16(step2[9], step2[10]); + step1[10] = vsub_s16(step2[9], step2[10]); + step1[11] = vsub_s16(step2[8], step2[11]); + step1[12] = vsub_s16(step2[15], step2[12]); + step1[13] = vsub_s16(step2[14], step2[13]); + step1[14] = vadd_s16(step2[14], step2[13]); + step1[15] = vadd_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vadd_s16(step1[0], step1[7]); + step2[1] = vadd_s16(step1[1], step1[6]); + step2[2] = vadd_s16(step1[2], step1[5]); + step2[3] = vadd_s16(step1[3], step1[4]); + step2[4] = vsub_s16(step1[3], step1[4]); + step2[5] = vsub_s16(step1[2], step1[5]); + step2[6] = vsub_s16(step1[1], step1[6]); + step2[7] = vsub_s16(step1[0], step1[7]); + idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vadd_s16(step2[0], step2[15]); + out[1] = vadd_s16(step2[1], step2[14]); + out[2] = vadd_s16(step2[2], step2[13]); + out[3] = vadd_s16(step2[3], step2[12]); + out[4] = vadd_s16(step2[4], step2[11]); + out[5] = vadd_s16(step2[5], step2[10]); + out[6] = vadd_s16(step2[6], step2[9]); + out[7] = vadd_s16(step2[7], step2[8]); + out[8] = vsub_s16(step2[7], step2[8]); + out[9] = vsub_s16(step2[6], step2[9]); + out[10] = vsub_s16(step2[5], step2[10]); + out[11] = vsub_s16(step2[4], step2[11]); + out[12] = vsub_s16(step2[3], step2[12]); + out[13] = vsub_s16(step2[2], step2[13]); + out[14] = vsub_s16(step2[1], step2[14]); + out[15] = vsub_s16(step2[0], step2[15]); + + // pass 1: save the result into output + vst1_s16(output, out[0]); + output += 4; + vst1_s16(output, out[1]); + output += 4; + vst1_s16(output, out[2]); + output += 4; + 
vst1_s16(output, out[3]); + output += 4; + vst1_s16(output, out[4]); + output += 4; + vst1_s16(output, out[5]); + output += 4; + vst1_s16(output, out[6]); + output += 4; + vst1_s16(output, out[7]); + output += 4; + vst1_s16(output, out[8]); + output += 4; + vst1_s16(output, out[9]); + output += 4; + vst1_s16(output, out[10]); + output += 4; + vst1_s16(output, out[11]); + output += 4; + vst1_s16(output, out[12]); + output += 4; + vst1_s16(output, out[13]); + output += 4; + vst1_s16(output, out[14]); + output += 4; + vst1_s16(output, out[15]); +} + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t ind[8]; + int16x8_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + ind[0] = vld1_s16(input); + input += 4; + ind[1] = vld1_s16(input); + input += 4; + ind[2] = vld1_s16(input); + input += 4; + ind[3] = vld1_s16(input); + input += 4; + ind[4] = vld1_s16(input); + input += 4; + ind[5] = vld1_s16(input); + input += 4; + ind[6] = vld1_s16(input); + input += 4; + ind[7] = vld1_s16(input); + + // Transpose + transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6], + ind[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + 
step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c deleted file mode 100644 index ecc263df28..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_dsp/vpx_dsp_common.h" - -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void vpx_push_neon(int64_t *store); -extern void vpx_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. 
- vpx_pop_neon(store_reg); -#endif - - return; -} - -void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif - - return; -} diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c new file mode 100644 index 0000000000..021211bc99 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -0,0 +1,678 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0, + int16x8_t *const in1, int16x8_t *const in2, + int16x8_t *const in3, int16x8_t *const in4, + int16x8_t *const in5, int16x8_t *const in6, + int16x8_t *const in7) { + *in0 = load_tran_low_to_s16q(input); + input += 32; + *in1 = load_tran_low_to_s16q(input); + input += 32; + *in2 = load_tran_low_to_s16q(input); + input += 32; + *in3 = load_tran_low_to_s16q(input); + input += 32; + *in4 = load_tran_low_to_s16q(input); + input += 32; + *in5 = load_tran_low_to_s16q(input); + input += 32; + *in6 = load_tran_low_to_s16q(input); + input += 32; + *in7 = load_tran_low_to_s16q(input); +} + +static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, + int16x4_t *const in1, int16x4_t *const in2, + int16x4_t *const in3, int16x4_t *const in4, + int16x4_t *const in5, int16x4_t *const in6, + int16x4_t *const in7) { + *in0 = load_tran_low_to_s16d(input); + input += 32; + *in1 = load_tran_low_to_s16d(input); + input += 32; + *in2 = load_tran_low_to_s16d(input); + input += 32; + *in3 = load_tran_low_to_s16d(input); + input += 32; + *in4 = load_tran_low_to_s16d(input); + input += 32; + *in5 = load_tran_low_to_s16d(input); + input += 32; + *in6 = load_tran_low_to_s16d(input); + input += 32; + *in7 = load_tran_low_to_s16d(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD.
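[Editorial illustration, not part of the patch] A minimal scalar sketch of the specialization described in the comment above, assuming the conventional two-multiply stage-1 butterfly and the dct_const_round_shift() helper declared in vpx_dsp/inv_txfm.h; the variable names s16, s31 and in[] here are hypothetical:

    /* Full butterfly: two multiplies and one rounded shift per output. */
    s16 = dct_const_round_shift(in[1] * cospi_31_64 - in[31] * cospi_1_64);
    s31 = dct_const_round_shift(in[1] * cospi_1_64 + in[31] * cospi_31_64);

    /* With eob <= 135 every coefficient outside the top-left 16x16 block is
     * zero (in[31] == 0 here), so each output collapses to a single multiply
     * and round; multiply_shift_and_narrow_s16() below is the NEON
     * counterpart of this reduced form. */
    s16 = dct_const_round_shift(in[1] * cospi_31_64);
    s31 = dct_const_round_shift(in[1] * cospi_1_64);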
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) { + int16x4_t tmp[8]; + int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32]; + + load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5], + &tmp[6], &tmp[7]); + transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], + tmp[7], &in[8], &in[9], &in[10], &in[11]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = 
multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s1[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s1[16], s2[19]); + s4[20] = vsubq_s16(s1[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s1[23]); + s4[24] = vaddq_s16(s1[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s1[24], s2[27]); + s4[28] = vsubq_s16(s1[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s1[31]); + + // stage 5 + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64); + + s5[8] = vaddq_s16(s2[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s2[8], s3[11]); + s5[12] = vsubq_s16(s2[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); + + // stage 6 + s6[0] = vaddq_s16(s5[0], s3[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s3[4]); + s6[4] = vsubq_s16(s5[3], s3[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); + s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); + + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = 
vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); + + // stage 7 + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s7[0], s6[31])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[1], s6[30])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[2], s6[29])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[3], s6[28])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[4], s7[27])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[5], s7[26])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[6], s7[25])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[7], s7[24])); + output += 16; + + vst1q_s16(output, vaddq_s16(s7[8], s7[23])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[9], s7[22])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[10], s7[21])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[11], s7[20])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[12], s6[19])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[13], s6[18])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[14], s6[17])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[15], s6[16])); + output += 16; + + vst1q_s16(output, vsubq_s16(s7[15], s6[16])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[14], s6[17])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[13], s6[18])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[12], s6[19])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[11], s7[20])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[10], s7[21])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[9], s7[22])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[8], s7[23])); + output += 16; + + vst1q_s16(output, vsubq_s16(s7[7], s7[24])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[6], s7[25])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[5], s7[26])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[4], s7[27])); + output += 16; + vst1q_s16(output, 
vsubq_s16(s7[3], s6[28])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[2], s6[29])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[1], s6[30])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[0], s6[31])); +} + +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag) { + int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + s2[16] = vaddq_s16(s1[16], s1[17]); + s2[17] = vsubq_s16(s1[16], s1[17]); + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[22] = vsubq_s16(s1[23], s1[22]); + s2[23] = vaddq_s16(s1[22], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[25]); + s2[25] = vsubq_s16(s1[24], s1[25]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + s2[30] = vsubq_s16(s1[31], s1[30]); + s2[31] = vaddq_s16(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64); + + s3[8] = vaddq_s16(s2[8], s2[9]); + s3[9] = vsubq_s16(s2[8], s2[9]); + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + s3[14] = vsubq_s16(s2[15], s2[14]); + s3[15] = vaddq_s16(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30], + 
cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); + + s4[4] = vaddq_s16(s3[4], s3[5]); + s4[5] = vsubq_s16(s3[4], s3[5]); + s4[6] = vsubq_s16(s3[7], s3[6]); + s4[7] = vaddq_s16(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14], + cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14], + cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s2[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s2[16], s2[19]); + s4[20] = vsubq_s16(s2[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s2[23]); + s4[24] = vaddq_s16(s2[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s2[24], s2[27]); + s4[28] = vsubq_s16(s2[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s2[31]); + + // stage 5 + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64); + + s5[8] = vaddq_s16(s3[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s3[8], s3[11]); + s5[12] = vsubq_s16(s3[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); + + // stage 6 + s6[0] = vaddq_s16(s5[0], s4[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s4[4]); + s6[4] = vsubq_s16(s5[3], s4[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); 
+ s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); + + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); + + // stage 7 + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = final_add(s7[0], s6[31]); + out[1] = final_add(s7[1], s6[30]); + out[2] = final_add(s7[2], s6[29]); + out[3] = final_add(s7[3], s6[28]); + out[4] = final_add(s7[4], s7[27]); + out[5] = final_add(s7[5], s7[26]); + out[6] = final_add(s7[6], s7[25]); + out[7] = final_add(s7[7], s7[24]); + out[8] = final_add(s7[8], s7[23]); + out[9] = final_add(s7[9], s7[22]); + out[10] = final_add(s7[10], s7[21]); + out[11] = final_add(s7[11], s7[20]); + out[12] = final_add(s7[12], s6[19]); + out[13] = final_add(s7[13], s6[18]); + out[14] = final_add(s7[14], s6[17]); + out[15] = final_add(s7[15], s6[16]); + out[16] = final_sub(s7[15], s6[16]); + out[17] = final_sub(s7[14], s6[17]); + out[18] = final_sub(s7[13], s6[18]); + out[19] = final_sub(s7[12], s6[19]); + out[20] = final_sub(s7[11], s7[20]); + out[21] = final_sub(s7[10], s7[21]); + out[22] = final_sub(s7[9], s7[22]); + out[23] = final_sub(s7[8], s7[23]); + out[24] = final_sub(s7[7], s7[24]); + out[25] = final_sub(s7[6], s7[25]); + out[26] = final_sub(s7[5], s7[26]); + out[27] = final_sub(s7[4], s7[27]); + out[28] = final_sub(s7[3], s6[28]); + out[29] = final_sub(s7[2], s6[29]); + out[30] = final_sub(s7[1], s6[30]); + out[31] = final_sub(s7[0], s6[31]); + + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + 
add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], + out[7], outputT, stride); + add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], + out[14], out[15], outputT + (8 * stride), stride); + add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], + out[22], out[23], outputT + (16 * stride), stride); + add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], + out[30], out[31], outputT + (24 * stride), stride); + } +} + +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 16]; + int16_t *t = temp; + + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 0); + t += (16 * 8); + dest += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm deleted file mode 100644 index 96d276b4d1..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm +++ /dev/null @@ -1,144 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vpx_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a seperate - ;file so other idct function could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|vpx_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 
<<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |vpx_idct32x32_1_add_neon| - END diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index dab7d098e8..8920b93363 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -10,131 +10,49 @@ #include -#include "./vpx_config.h" - +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; +static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqaddq_u8(a0, res); + const uint8x16_t b1 = vqaddq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; +static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqsubq_u8(a0, res); + const uint8x16_t b1 = vqsubq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; -} +void vpx_idct32x32_1_add_neon(const tran_low_t 
*input, uint8_t *dest, + int stride) { + int i; + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; -} - -void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_pos_kernel(&dest, stride, dc); } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + } else { + const uint8x16_t dc = create_dcq(-a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_neg_kernel(&dest, stride, dc); } } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c new file mode 100644 index 0000000000..f3c336fa31 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. 
Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) { + int16x8_t in[8], s1[32], s2[32], s3[32]; + + in[0] = load_tran_low_to_s16q(input); + input += 32; + in[1] = load_tran_low_to_s16q(input); + input += 32; + in[2] = load_tran_low_to_s16q(input); + input += 32; + in[3] = load_tran_low_to_s16q(input); + input += 32; + in[4] = load_tran_low_to_s16q(input); + input += 32; + in[5] = load_tran_low_to_s16q(input); + input += 32; + in[6] = load_tran_low_to_s16q(input); + input += 32; + in[7] = load_tran_low_to_s16q(input); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30], + cospi_24_64); + s1[29] = 
multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30], + cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31], + cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); + + // stage 6 + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64); + + s2[16] = vaddq_s16(s1[16], s2[23]); + s2[17] = vaddq_s16(s1[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s2[22] = vsubq_s16(s1[17], s2[22]); + s2[23] = vsubq_s16(s1[16], s2[23]); + + s3[24] = vsubq_s16(s1[31], s2[24]); + s3[25] = vsubq_s16(s1[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s1[30]); + s2[31] = vaddq_s16(s2[24], s1[31]); + + // stage 7 + s1[0] = vaddq_s16(s2[0], s2[15]); + s1[1] = vaddq_s16(s2[1], s2[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s2[9]); + s1[7] = vaddq_s16(s2[7], s2[8]); + s1[8] = vsubq_s16(s2[7], s2[8]); + s1[9] = vsubq_s16(s2[6], s2[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s2[14]); + s1[15] = vsubq_s16(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s1[0], s2[31])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[1], s2[30])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[2], s2[29])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[3], s2[28])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[4], s1[27])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[5], s1[26])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[6], s1[25])); + output += 8; + vst1q_s16(output, 
vaddq_s16(s1[7], s1[24])); + output += 8; + + vst1q_s16(output, vaddq_s16(s1[8], s1[23])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[9], s1[22])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[10], s1[21])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[11], s1[20])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[12], s2[19])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[13], s2[18])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[14], s2[17])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[15], s2[16])); + output += 8; + + vst1q_s16(output, vsubq_s16(s1[15], s2[16])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[14], s2[17])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[13], s2[18])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[12], s2[19])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[11], s1[20])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[10], s1[21])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[9], s1[22])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[8], s1[23])); + output += 8; + + vst1q_s16(output, vsubq_s16(s1[7], s1[24])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[6], s1[25])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[5], s1[26])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[4], s1[27])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[3], s2[28])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[2], s2[29])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[1], s2[30])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[0], s2[31])); +} + +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag) { + int16x8_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28], + cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + 
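// Annotation (not part of the upstream change): the *_shift_and_narrow_s16
// helpers used above and below are presumably declared in
// vpx_dsp/arm/idct_neon.h, which this file includes. Judging by the paired
// formula comments in stage 1 of vpx_idct32_6_neon, they compute, per lane,
//   multiply_shift_and_narrow_s16(a, c)                  ~ dct_const_round_shift(a * c)
//   multiply_accumulate_shift_and_narrow_s16(a, ca, b, cb) ~ dct_const_round_shift(a * ca + b * cb)
//   sub_multiply_shift_and_narrow_s16(a, b, c)            ~ dct_const_round_shift((a - b) * c)
//   add_multiply_shift_and_narrow_s16(a, b, c)            ~ dct_const_round_shift((a + b) * c)
// each narrowed back to int16_t, i.e. the usual 14-bit rounded idct butterfly
// terms that the deleted assembly expressed with DO_BUTTERFLY.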
// stage 4 + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12], + cospi_24_64); + + s2[16] = vaddq_s16(s1[16], s1[19]); + + s2[17] = vaddq_s16(s1[17], s1[18]); + s2[18] = vsubq_s16(s1[17], s1[18]); + + s2[19] = vsubq_s16(s1[16], s1[19]); + + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + s2[28] = vsubq_s16(s1[31], s1[28]); + s2[29] = vsubq_s16(s1[30], s1[29]); + s2[30] = vaddq_s16(s1[29], s1[30]); + s2[31] = vaddq_s16(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); + + s1[8] = vaddq_s16(s2[8], s2[11]); + s1[9] = vaddq_s16(s2[9], s2[10]); + s1[10] = vsubq_s16(s2[9], s2[10]); + s1[11] = vsubq_s16(s2[8], s2[11]); + s1[12] = vsubq_s16(s2[15], s2[12]); + s1[13] = vsubq_s16(s2[14], s2[13]); + s1[14] = vaddq_s16(s2[13], s2[14]); + s1[15] = vaddq_s16(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29], + cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29], + cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28], + cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); + + // stage 6 + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64); + + s1[16] = vaddq_s16(s2[16], s2[23]); + s1[17] = vaddq_s16(s2[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s1[22] = vsubq_s16(s2[17], s2[22]); + s1[23] = vsubq_s16(s2[16], s2[23]); + + s3[24] = vsubq_s16(s2[31], s2[24]); + s3[25] = vsubq_s16(s2[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s2[30]); + s2[31] = vaddq_s16(s2[24], s2[31]); + + // stage 7 + 
s1[0] = vaddq_s16(s2[0], s1[15]); + s1[1] = vaddq_s16(s2[1], s1[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s1[9]); + s1[7] = vaddq_s16(s2[7], s1[8]); + s1[8] = vsubq_s16(s2[7], s1[8]); + s1[9] = vsubq_s16(s2[6], s1[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s1[14]); + s1[15] = vsubq_s16(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = final_add(s1[0], s2[31]); + out[1] = final_add(s1[1], s2[30]); + out[2] = final_add(s1[2], s2[29]); + out[3] = final_add(s1[3], s2[28]); + out[4] = final_add(s1[4], s1[27]); + out[5] = final_add(s1[5], s1[26]); + out[6] = final_add(s1[6], s1[25]); + out[7] = final_add(s1[7], s1[24]); + out[8] = final_add(s1[8], s2[23]); + out[9] = final_add(s1[9], s2[22]); + out[10] = final_add(s1[10], s1[21]); + out[11] = final_add(s1[11], s1[20]); + out[12] = final_add(s1[12], s2[19]); + out[13] = final_add(s1[13], s2[18]); + out[14] = final_add(s1[14], s1[17]); + out[15] = final_add(s1[15], s1[16]); + out[16] = final_sub(s1[15], s1[16]); + out[17] = final_sub(s1[14], s1[17]); + out[18] = final_sub(s1[13], s2[18]); + out[19] = final_sub(s1[12], s2[19]); + out[20] = final_sub(s1[11], s1[20]); + out[21] = final_sub(s1[10], s1[21]); + out[22] = final_sub(s1[9], s2[22]); + out[23] = final_sub(s1[8], s2[23]); + out[24] = final_sub(s1[7], s1[24]); + out[25] = final_sub(s1[6], s1[25]); + out[26] = final_sub(s1[5], s1[26]); + out[27] = final_sub(s1[4], s1[27]); + out[28] = final_sub(s1[3], s2[28]); + out[29] = final_sub(s1[2], s2[29]); + out[30] = final_sub(s1[1], s2[30]); + out[31] = final_sub(s1[0], s2[31]); + + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], + out[7], outputT, stride); + add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], + out[14], out[15], outputT + (8 * stride), stride); + add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], + out[22], out[23], outputT + (16 * stride), stride); + add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], + out[30], out[31], outputT + (24 * stride), stride); + } +} + +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 0); + t += (8 * 8); + dest += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm deleted file mode 100644 index 7483ee77e1..0000000000 --- 
a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm +++ /dev/null @@ -1,1299 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. -cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |vpx_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. 
- MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! 
- ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. 
- ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two register) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|vpx_idct32x32_1024_add_neon| PROC - ; This function does one pass of idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the colums are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). 
- push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices after each other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eigth line only read - ; advance by 16*2 to go the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. 
To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = 
step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; 
-------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - 
;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, 
q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * 
cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * 
cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = 
step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - 
step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. 
we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + 
step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |vpx_idct32x32_1024_add_neon| - END diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index c4d1e8473c..9f4589ea96 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -11,630 +11,766 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); +static INLINE void load_from_transformed(const int16_t *const trans_buf, + const int first, const int second, + int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(trans_buf + first * 8); + *q1 = vld1q_s16(trans_buf + second * 8); +} -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); +static INLINE void load_from_output(const int16_t *const out, const int first, + const int second, int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(out + first * 32); + *q1 = vld1q_s16(out + second * 32); +} -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); 
+static INLINE void store_in_output(int16_t *const out, const int first, + const int second, const int16x8_t q0, + const int16x8_t q1) { + vst1q_s16(out + first * 32, q0); + vst1q_s16(out + second * 32, q1); +} -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; +static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2, + const int stride, int16x8_t q0, + int16x8_t q1, int16x8_t q2, + int16x8_t q3) { + uint8x8_t d[4]; - d8s16 = vld1_s16((int16_t *)p1); + d[0] = vld1_u8(p1); p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); + d[1] = vld1_u8(p1); + d[3] = vld1_u8(p2); p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); + d[2] = vld1_u8(p2); - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); + q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0])); + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1])); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2])); + q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3])); - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d[0] = vqmovun_s16(q0); + d[1] = vqmovun_s16(q1); + d[2] = vqmovun_s16(q2); + d[3] = vqmovun_s16(q3); - vst1_s16((int16_t *)p1, d9s16); + vst1_u8(p1, d[1]); p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); + vst1_u8(p1, d[0]); + vst1_u8(p2, d[2]); p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; + vst1_u8(p2, d[3]); } -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; +static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2, + const int stride, + int16x8_t q0, int16x8_t q1, + int16x8_t q2, + int16x8_t q3) { + uint16x8_t d[4]; - d4s16 = vld1_s16((int16_t *)p1); + d[0] = vld1q_u16(p1); p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); + d[2] = vld1q_u16(p2); - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); - q5s16 = 
vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0])); + q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1])); + q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2])); + q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3])); - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + d[0] = vmovl_u8(vqmovun_s16(q0)); + d[1] = vmovl_u8(vqmovun_s16(q1)); + d[2] = vmovl_u8(vqmovun_s16(q2)); + d[3] = vmovl_u8(vqmovun_s16(q3)); - vst1_s16((int16_t *)p1, d5s16); + vst1q_u16(p1, d[1]); p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; + vst1q_u16(p2, d[3]); } -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, - int16_t first_const, int16_t second_const, - int16x8_t *qAs16, int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; +static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1, + const int16_t first_const, + const int16_t second_const, + int16x8_t *const qOut0, + int16x8_t *const qOut1) { + int32x4_t q[4]; + int16x4_t d[6]; - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); + d[0] = vget_low_s16(qIn0); + d[1] = vget_high_s16(qIn0); + d[2] = vget_low_s16(qIn1); + d[3] = vget_high_s16(qIn1); - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); + // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9. 
+ d[4] = vdup_n_s16(first_const); + d[5] = vdup_n_s16(second_const); - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); + q[0] = vmull_s16(d[0], d[4]); + q[1] = vmull_s16(d[1], d[4]); + q[0] = vmlsl_s16(q[0], d[2], d[5]); + q[1] = vmlsl_s16(q[1], d[3], d[5]); - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); + q[2] = vmull_s16(d[0], d[5]); + q[3] = vmull_s16(d[1], d[5]); + q[2] = vmlal_s16(q[2], d[2], d[4]); + q[3] = vmlal_s16(q[3], d[3], d[4]); - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); - return; + *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS), + vrshrn_n_s32(q[1], DCT_CONST_BITS)); + *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS), + vrshrn_n_s32(q[3], DCT_CONST_BITS)); } -static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { - int16_t *in; +static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0, + int16x8_t *const s1, int16x8_t *const s2, + int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, + int16x8_t *const s7) { + *s0 = vld1q_s16(in); + in += 32; + *s1 = vld1q_s16(in); + in += 32; + *s2 = vld1q_s16(in); + in += 32; + *s3 = vld1q_s16(in); + in += 32; + *s4 = vld1q_s16(in); + in += 32; + *s5 = vld1q_s16(in); + in += 32; + *s6 = vld1q_s16(in); + in += 32; + *s7 = vld1q_s16(in); +} + +static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1, + int16x8_t a2, int16x8_t a3, + int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, + int16_t **out) { + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s16(*out, a0); + *out += 8; + vst1q_s16(*out, a1); + *out += 8; + vst1q_s16(*out, a2); + *out += 8; + vst1q_s16(*out, a3); + *out += 8; + vst1q_s16(*out, a4); + *out += 8; + vst1q_s16(*out, a5); + *out += 8; + vst1q_s16(*out, a6); + *out += 8; + vst1q_s16(*out, a7); + *out += 8; +} + +static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) { int i; - const int stride = 32; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - vst1q_s16(t_buf, q8s16); - t_buf += 8; - vst1q_s16(t_buf, q9s16); - t_buf += 8; - vst1q_s16(t_buf, q10s16); - t_buf += 8; - vst1q_s16(t_buf, q11s16); - t_buf += 8; - vst1q_s16(t_buf, q12s16); - t_buf += 8; - vst1q_s16(t_buf, q13s16); - t_buf += 8; - vst1q_s16(t_buf, q14s16); - t_buf += 8; - vst1q_s16(t_buf, q15s16); - t_buf += 8; + load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); } - return; } -static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, - 
int16x8_t q3s16, int16x8_t q6s16, - int16x8_t q7s16, int16x8_t q8s16, - int16x8_t q9s16, int16x8_t q10s16, - int16x8_t q11s16, int16x8_t q12s16, - int16x8_t q13s16, int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void load_s16x8q_tran_low( + const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) { + *s0 = load_tran_low_to_s16q(in); + in += 32; + *s1 = load_tran_low_to_s16q(in); + in += 32; + *s2 = load_tran_low_to_s16q(in); + in += 32; + *s3 = load_tran_low_to_s16q(in); + in += 32; + *s4 = load_tran_low_to_s16q(in); + in += 32; + *s5 = load_tran_low_to_s16q(in); + in += 32; + *s6 = load_tran_low_to_s16q(in); + in += 32; + *s7 = load_tran_low_to_s16q(in); } -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, 
uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, - int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, - int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, - int16x8_t q14s16, int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest /* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; +static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input, + int16_t *t_buf) { + int i; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + for (i = 0; i < 4; i++, input += 8) { + load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} +#else // !CONFIG_VP9_HIGHBITDEPTH +#define idct32_transpose_pair_tran_low idct32_transpose_pair +#endif // CONFIG_VP9_HIGHBITDEPTH - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; +static INLINE void idct32_bands_end_1st_pass(int16_t *const out, + int16x8_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + 
load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); } -void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { +static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out, + uint8_t *const dest, + const int stride, + int16x8_t *const q) { + uint8_t *dest0 = dest + 0 * stride; + uint8_t *dest1 = dest + 31 * stride; + uint8_t *dest2 = dest + 16 * stride; + uint8_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 
-= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); +} + +static INLINE void highbd_idct32_bands_end_2nd_pass_bd8( + const int16_t *const out, uint16_t *const dest, const int stride, + int16x8_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = 
final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); +} + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag) { int i, idct32_pass_loop; int16_t trans_buf[32 * 8]; int16_t pass1[32 * 32]; int16_t pass2[32 * 32]; + const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t q[16]; + uint16_t *dst = CAST_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); + idct32_pass_loop++, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + if (idct32_pass_loop == 0) { + idct32_transpose_pair_tran_low(input, trans_buf); + input += 32 * 8; + } else { + idct32_transpose_pair(input_pass2, trans_buf); + input_pass2 += 32 * 8; + } // ----------------------------------------- // BLOCK A: 16-19,28-31 // ----------------------------------------- // generate 16,17,30,31 // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); + q[4] = vaddq_s16(q[0], q[1]); + q[13] = vsubq_s16(q[0], q[1]); + q[6] = vaddq_s16(q[2], q[3]); + q[14] = vsubq_s16(q[2], q[3]); // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); // generate 18,19,28,29 // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + 
load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); + q[13] = vsubq_s16(q[3], q[2]); + q[3] = vaddq_s16(q[3], q[2]); + q[14] = vsubq_s16(q[1], q[0]); + q[2] = vaddq_s16(q[1], q[0]); // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[0]); + q[10] = vaddq_s16(q[7], q[1]); + q[15] = vaddq_s16(q[6], q[3]); + q[13] = vsubq_s16(q[5], q[0]); + q[14] = vsubq_s16(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); + q[13] = vsubq_s16(q[4], q[2]); + q[14] = vsubq_s16(q[6], q[3]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); // ----------------------------------------- // BLOCK B: 20-23,24-27 // ----------------------------------------- // generate 20,21,26,27 // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); // generate 22,23,24,25 // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, 
q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); + q[10] = vaddq_s16(q[7], q[1]); + q[11] = vaddq_s16(q[5], q[0]); + q[12] = vaddq_s16(q[6], q[2]); + q[15] = vaddq_s16(q[4], q[3]); // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[11]); + q[9] = vaddq_s16(q[13], q[10]); + q[13] = vsubq_s16(q[13], q[10]); + q[11] = vsubq_s16(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = vsubq_s16(q[9], q[12]); + q[10] = vaddq_s16(q[14], q[15]); + q[14] = vsubq_s16(q[14], q[15]); + q[12] = vaddq_s16(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + q[14] = vsubq_s16(q[5], q[0]); + q[13] = vsubq_s16(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = vsubq_s16(q[7], q[1]); + q[13] = vsubq_s16(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[1]); + q[9] = vaddq_s16(q[13], q[6]); + q[13] = vsubq_s16(q[13], q[6]); + q[1] = vsubq_s16(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = vsubq_s16(q[8], q[5]); + q[10] = vaddq_s16(q[8], q[5]); + q[11] = vaddq_s16(q[9], q[0]); 
+ q[0] = vsubq_s16(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); // ----------------------------------------- // BLOCK C: 8-10,11-15 // ----------------------------------------- // generate 8,9,14,15 // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); // generate 10,11,12,13 // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + q[8] = vaddq_s16(q[0], q[5]); + q[9] = vaddq_s16(q[1], q[7]); + q[13] = vsubq_s16(q[1], q[7]); + q[14] = vsubq_s16(q[3], q[4]); + q[10] = vaddq_s16(q[3], q[4]); + q[15] = vaddq_s16(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] 
= vsubq_s16(q[0], q[5]); + q[14] = vsubq_s16(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); // ----------------------------------------- // BLOCK D: 0-3,4-7 // ----------------------------------------- // generate 4,5,6,7 // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); // generate 0,1,2,3 // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); + q[4] = vaddq_s16(q[7], q[6]); + q[7] = vsubq_s16(q[7], q[6]); + q[6] = vsubq_s16(q[5], q[14]); + q[5] = vaddq_s16(q[5], q[14]); // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[3]); + q[10] = vaddq_s16(q[6], q[1]); + q[11] = vaddq_s16(q[7], q[0]); + q[12] = vsubq_s16(q[7], q[0]); + q[13] = vsubq_s16(q[6], q[1]); + q[14] = vsubq_s16(q[5], q[3]); + q[15] = vsubq_s16(q[4], q[2]); // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = vaddq_s16(q[8], q[1]); + q[3] = vaddq_s16(q[9], q[0]); + q[4] = vsubq_s16(q[9], q[0]); + q[5] = vsubq_s16(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, - q15s16); + idct32_bands_end_1st_pass(out, q); } else { - idct32_bands_end_2nd_pass(out, dest, stride, q2s16, 
q3s16, q6s16, q7s16, - q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16); - dest += 8; + if (highbd_flag) { + highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q); + dst += 8; + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q); + dest += 8; + } } } } - return; +} + +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + vpx_idct32_32_neon(input, dest, stride, 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm index adab715dde..d83421e9e6 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -15,19 +15,17 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_1_add_neon| PROC ldrsh r0, [r0] - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw r12, #0x2d41 ; out = dct_const_round_shift(input[0] * cospi_16_64) mul r0, r0, r12 ; input[0] * cospi_16_64 diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index 9f999e979d..a14b895431 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -9,38 +9,39 @@ */ #include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); - - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; +static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, + const int16x8_t res, + uint32x2_t *const d) { + uint16x8_t a; + uint8x8_t b; + *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); + *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); + a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); + b = vqmovun_s16(vreinterpretq_s16_u16(a)); + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); + *dest += stride; + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); + *dest += stride; +} + +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint32x2_t d = vdup_n_u32(0); + + assert(!((intptr_t)dest % 
sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + idct4x4_1_add_kernel(&dest, stride, dc, &d); + idct4x4_1_add_kernel(&dest, stride, dc, &d); } diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm index 877fbd6343..184d218941 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -15,12 +15,14 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_16_add_neon| PROC @@ -33,18 +35,15 @@ ; So, two passes of a transpose followed by a column transform. ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e + ; cospi_8_64 = 15137 + movw r0, #0x3b21 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 + ; cospi_24_64 = 6270 + movw r12, #0x187e ; transpose the input data ; 00 01 02 03 d16 @@ -73,16 +72,15 @@ ; do the transform on transposed rows ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 ; (input[0] + input[2]) * cospi_16_64; ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 + vmull.s16 q8, d16, d21 + vmull.s16 q14, d18, d21 + vadd.s32 q13, q8, q14 + vsub.s32 q14, q8, q14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64; ; input[1] * cospi_8_64 + input[3] * cospi_24_64; @@ -90,10 +88,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -141,10 +139,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -169,7 +167,7 @@ vld1.32 {d27[1]}, [r1], r2 vld1.32 {d27[0]}, [r1] ; no post-increment - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i] vaddw.u8 q8, q8, d26 vaddw.u8 q9, q9, d27 diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 382626928b..673a36840e 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -9,138 +9,54 @@ */ #include +#include -void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, 
q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" - d26u32 = d27u32 = vdup_n_u32(0); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const uint8_t *dst = dest; + const int16x4_t cospis = vld1_s16(kCospi); + uint8x8_t dest01_u8; + uint32x2_t dest32_u32 = vdup_n_u32(0); + int16x8_t a0, a1; + uint8x8_t d01, d32; + uint16x8_t d01_u16, d32_u16; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); + // Rows + a0 = load_tran_low_to_s16q(input); + a1 = load_tran_low_to_s16q(input + 8); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); + dest01_u8 = load_u8(dst, stride); + dst += 2 * stride; + // The elements are loaded in reverse order. + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + dst += stride; + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); + d32_u16 = + vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); + d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); + d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); - d22s16 = vdup_n_s16(cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = 
vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; + store_u8(dest, stride, d01); + dest += 2 * stride; + // The elements are stored in reverse order. + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm deleted file mode 100644 index dbbff364f3..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ /dev/null @@ -1,88 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |vpx_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vpx_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct8x8_1_add_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index e3db0b876b..ce9b459589 100644 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -10,52 +10,56 @@ #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - 
d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; +static INLINE uint8x8_t create_dcd(const int16_t dc) { + int16x8_t t = vdupq_n_s16(dc); + return vqmovun_s16(t); +} + +static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqadd_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} + +static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqsub_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} + +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + + if (a1 >= 0) { + const uint8x8_t dc = create_dcd(a1); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x8_t dc = create_dcd(-a1); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + } } diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm deleted file mode 100644 index 6ab59b41b7..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ /dev/null @@ -1,519 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_idct8x8_64_add_neon| - EXPORT |vpx_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. 
- MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - 
vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vpx_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_64_add_neon| - -;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; 
r2 int dest_stride) - -|vpx_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, 
q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_12_add_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 82b3182568..1121ade279 100644 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -11,435 +11,117 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; +static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint8_t *dest, + const int stride) { + const uint8_t *dst = dest; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); + a0 = vrshrq_n_s16(a0, 5); + a1 = vrshrq_n_s16(a1, 5); + a2 = vrshrq_n_s16(a2, 5); + a3 = vrshrq_n_s16(a3, 5); + a4 = vrshrq_n_s16(a4, 5); + a5 = vrshrq_n_s16(a5, 5); + a6 = vrshrq_n_s16(a6, 5); + a7 = vrshrq_n_s16(a7, 5); - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - 
d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); + d0 = vld1_u8(dst); + dst += stride; + d1 = vld1_u8(dst); + dst += stride; + d2 = vld1_u8(dst); + dst += stride; + d3 = vld1_u8(dst); + dst += stride; + d4 = vld1_u8(dst); + dst += stride; + d5 = vld1_u8(dst); + dst += stride; + d6 = vld1_u8(dst); + dst += stride; + d7 = vld1_u8(dst); - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); + d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); + d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); + d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); + d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); + d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); + d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); + d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); + d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); + d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); + d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); + d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); + d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); + d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); + d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); + d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - 
d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; + vst1_u8(dest, d0); + dest += stride; + vst1_u8(dest, d1); + dest += stride; + vst1_u8(dest, d2); + dest += stride; + vst1_u8(dest, d3); + dest += stride; + vst1_u8(dest, d4); + dest += stride; + vst1_u8(dest, d5); + dest += stride; + vst1_u8(dest, d6); + dest += stride; + vst1_u8(dest, d7); } -void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a0 = load_tran_low_to_s16q(input); + int16x8_t a1 = load_tran_low_to_s16q(input + 8); + int16x8_t a2 = load_tran_low_to_s16q(input + 16); + int16x8_t a3 = load_tran_low_to_s16q(input + 24); + int16x8_t a4 = load_tran_low_to_s16q(input + 32); + int16x8_t a5 = load_tran_low_to_s16q(input + 40); + int16x8_t a6 = load_tran_low_to_s16q(input + 48); + int16x8_t a7 = load_tran_low_to_s16q(input + 56); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = 
vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); } -void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t a0, 
a1, a2, a3, a4, a5, a6, a7; + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + a0 = load_tran_low_to_s16d(input); + a1 = load_tran_low_to_s16d(input + 8); + a2 = load_tran_low_to_s16d(input + 16); + a3 = load_tran_low_to_s16d(input + 24); - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16(cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), 
vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, + &a5, &a6, &a7); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, + a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); } diff --git a/libs/libvpx/vpx_dsp/arm/idct_neon.asm b/libs/libvpx/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 0000000000..5dd9bdc788 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,46 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper functions used to load tran_low_t into int16, narrowing if + ; necessary. + + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! + ENDIF + MEND + + ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld2.s32 {q0,q1}, [$src]! + vld2.s32 {q2,q3}, [$src]! 
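+    ; In high-bitdepth builds tran_low_t is 32 bits wide, so vmovn narrows
+    ; each lane back to int16 before the idct kernels consume it.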
+ vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q2 + vmovn.i32 $dst2, q1 + vmovn.i32 $dst3, q3 + ELSE + vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]! + ENDIF + MEND + END diff --git a/libs/libvpx/vpx_dsp/arm/idct_neon.h b/libs/libvpx/vpx_dsp/arm/idct_neon.h new file mode 100644 index 0000000000..6ed02af5ac --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct_neon.h @@ -0,0 +1,940 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_DSP_ARM_IDCT_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" + +static const int16_t kCospi[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +static const int32_t kCospi32[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +//------------------------------------------------------------------------------ +// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth +static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) { +#if CONFIG_VP9_HIGHBITDEPTH + return vqaddq_s16(a, b); +#else + return vaddq_s16(a, b); +#endif +} + +static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) { +#if CONFIG_VP9_HIGHBITDEPTH + return vqsubq_s16(a, b); +#else + return vsubq_s16(a, b); +#endif +} + +//------------------------------------------------------------------------------ + +static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vaddq_s32(s0.val[0], s1.val[0]); + t.val[1] = vaddq_s32(s0.val[1], s1.val[1]); + return t; +} + +static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vsubq_s32(s0.val[0], s1.val[0]); + t.val[1] = vsubq_s32(s0.val[1], s1.val[1]); + return t; +} + +//------------------------------------------------------------------------------ + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, + const int16_t a_const) { + // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed + // streams. See WRAPLOW and dct_const_round_shift for details. + // This instruction doubles the result and returns the high half, essentially + // resulting in a right shift by 15. 
By multiplying the constant first that + // becomes a right shift by DCT_CONST_BITS. + // The largest possible value used here is + // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just* + // within the range of int16_t (+32767 / -32768) even when negated. + return vqrdmulhq_n_s16(a, a_const * 2); +} + +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. +static INLINE int16x8_t add_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + // In both add_ and it's pair, sub_, the input for well-formed streams will be + // well within 16 bits (input to the idct is the difference between two frames + // and will be within -255 to 255, or 9 bits) + // However, for inputs over about 25,000 (valid for int16_t, but not for idct + // input) this function can not use vaddq_s16. + // In order to match existing behavior and intentionally out of range tests, + // expand the addition up to 32 bits to prevent truncation. + int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), + vrshrn_n_s32(temp_high, DCT_CONST_BITS)); +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), + vrshrn_n_s32(temp_high, DCT_CONST_BITS)); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); + int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); + temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); + temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); + return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), + vrshrn_n_s32(temp_high, DCT_CONST_BITS)); +} + +//------------------------------------------------------------------------------ + +// Note: The following 4 functions could use 32-bit operations for bit-depth 10. +// However, although it's 20% faster with gcc, it's 20% slower with clang. +// Use 64-bit operations for now. + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t +multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { + int64x2_t b[4]; + int32x4x2_t c; + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS), + vrshrn_n_s64(b[1], DCT_CONST_BITS)); + c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS), + vrshrn_n_s64(b[3], DCT_CONST_BITS)); + return c; +} + +// Add a and b, then multiply by ab_const. 
Shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]); + const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]); + int64x2_t c[4]; + int32x4x2_t d; + c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); + c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); + c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); + c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); + d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + return d; +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]); + const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]); + int64x2_t c[4]; + int32x4x2_t d; + c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); + c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); + c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); + c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); + d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + return d; +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, + const int32_t b_const) { + int64x2_t c[4]; + int32x4x2_t d; + c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const); + c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); + c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); + c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); + d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + return d; +} + +// Shift the output down by 6 and add it to the destination buffer. 
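+// vrsraq_n_s16(b, a, 6) computes b + ROUND_POWER_OF_TWO(a, 6) in a single
+// instruction, fusing the rounding shift with the reconstruction add. For
+// example, a residual lane of 70 over a destination pixel of 200 gives
+// 200 + ((70 + 32) >> 6) = 201, which vqmovun_s16 then clips to [0, 255].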
+static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, + const int16x8_t a2, const int16x8_t a3, + const int16x8_t a4, const int16x8_t a5, + const int16x8_t a6, const int16x8_t a7, + uint8_t *b, const int b_stride) { + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + b0 = vld1_u8(b); + b += b_stride; + b1 = vld1_u8(b); + b += b_stride; + b2 = vld1_u8(b); + b += b_stride; + b3 = vld1_u8(b); + b += b_stride; + b4 = vld1_u8(b); + b += b_stride; + b5 = vld1_u8(b); + b += b_stride; + b6 = vld1_u8(b); + b += b_stride; + b7 = vld1_u8(b); + b -= (7 * b_stride); + + // c = b + (a >> 6) + c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); + c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); + c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); + c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); + c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); + c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); + c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); + c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + + b0 = vqmovun_s16(c0); + b1 = vqmovun_s16(c1); + b2 = vqmovun_s16(c2); + b3 = vqmovun_s16(c3); + b4 = vqmovun_s16(c4); + b5 = vqmovun_s16(c5); + b6 = vqmovun_s16(c6); + b7 = vqmovun_s16(c7); + + vst1_u8(b, b0); + b += b_stride; + vst1_u8(b, b1); + b += b_stride; + vst1_u8(b, b2); + b += b_stride; + vst1_u8(b, b3); + b += b_stride; + vst1_u8(b, b4); + b += b_stride; + vst1_u8(b, b5); + b += b_stride; + vst1_u8(b, b6); + b += b_stride; + vst1_u8(b, b7); +} + +static INLINE uint8x16_t create_dcq(const int16_t dc) { + // Clip both sides and gcc may compile to assembly 'usat'. + const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 
255 : dc); + return vdupq_n_u8((uint8_t)t); +} + +static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, + int16x8_t *const a0, + int16x8_t *const a1) { + int16x4_t b0, b1, b2, b3; + int32x4_t c0, c1, c2, c3; + int16x8_t d0, d1; + + transpose_s16_4x4q(a0, a1); + b0 = vget_low_s16(*a0); + b1 = vget_high_s16(*a0); + b2 = vget_low_s16(*a1); + b3 = vget_high_s16(*a1); + c0 = vmull_lane_s16(b0, cospis, 2); + c2 = vmull_lane_s16(b1, cospis, 2); + c1 = vsubq_s32(c0, c2); + c0 = vaddq_s32(c0, c2); + c2 = vmull_lane_s16(b2, cospis, 3); + c3 = vmull_lane_s16(b2, cospis, 1); + c2 = vmlsl_lane_s16(c2, b3, cospis, 1); + c3 = vmlal_lane_s16(c3, b3, cospis, 3); + b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); + b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); + b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); + b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); + d0 = vcombine_s16(b0, b1); + d1 = vcombine_s16(b3, b2); + *a0 = vaddq_s16(d0, d1); + *a1 = vsubq_s16(d0, d1); +} + +static INLINE void idct8x8_12_pass1_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, + int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, + int16x4_t *const io6, int16x4_t *const io7) { + int16x4_t step1[8], step2[8]; + int32x4_t t32[2]; + + transpose_s16_4x4d(io0, io1, io2, io3); + + // stage 1 + step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + + step2[4] = vadd_s16(step1[4], step1[5]); + step2[5] = vsub_s16(step1[4], step1[5]); + step2[6] = vsub_s16(step1[7], step1[6]); + step2[7] = vadd_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vadd_s16(step2[1], step2[3]); + step1[1] = vadd_s16(step2[1], step2[2]); + step1[2] = vsub_s16(step2[1], step2[2]); + step1[3] = vsub_s16(step2[1], step2[3]); + + t32[1] = vmull_lane_s16(step2[6], cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); + step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + + // stage 4 + *io0 = vadd_s16(step1[0], step2[7]); + *io1 = vadd_s16(step1[1], step1[6]); + *io2 = vadd_s16(step1[2], step1[5]); + *io3 = vadd_s16(step1[3], step2[4]); + *io4 = vsub_s16(step1[3], step2[4]); + *io5 = vsub_s16(step1[2], step1[5]); + *io6 = vsub_s16(step1[1], step1[6]); + *io7 = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + const int16x4_t input0, const int16x4_t input1, const int16x4_t input2, + const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, + const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, + int16x8_t *const output1, int16x8_t *const output2, + int16x8_t *const output3, int16x8_t *const output4, + int16x8_t *const output5, int16x8_t *const output6, + int16x8_t *const output7) { + int16x8_t in[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, + input7, &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); + step1[5] = vqrdmulhq_lane_s16(in[3], 
cospisd1, 2); + step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1); + step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2); + step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3); + step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[1], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[1], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *output0 = vaddq_s16(step1[0], step2[7]); + *output1 = vaddq_s16(step1[1], step1[6]); + *output2 = vaddq_s16(step1[2], step1[5]); + *output3 = vaddq_s16(step1[3], step2[4]); + *output4 = vsubq_s16(step1[3], step2[4]); + *output5 = vsubq_s16(step1[2], step1[5]); + *output6 = vsubq_s16(step1[1], step1[6]); + *output7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io0, int16x8_t *const io1, + int16x8_t *const io2, int16x8_t *const io3, + int16x8_t *const io4, int16x8_t *const io5, + int16x8_t *const io6, + int16x8_t *const io7) { + int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int16x4_t step1l[4], step1h[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s16(*io1); + input_1h = vget_high_s16(*io1); + input_3l = vget_low_s16(*io3); + input_3h = vget_high_s16(*io3); + input_5l = vget_low_s16(*io5); + input_5h = vget_high_s16(*io5); + input_7l = vget_low_s16(*io7); + input_7h = vget_high_s16(*io7); + step1l[0] = vget_low_s16(*io0); + step1h[0] = vget_high_s16(*io0); + step1l[1] = vget_low_s16(*io2); + step1h[1] = vget_high_s16(*io2); + step1l[2] = vget_low_s16(*io4); + step1h[2] = vget_high_s16(*io4); + step1l[3] = vget_low_s16(*io6); + step1h[3] = vget_high_s16(*io6); + + t32[0] = vmull_lane_s16(input_1l, cospis1, 3); + t32[1] = vmull_lane_s16(input_1h, cospis1, 3); + t32[2] = vmull_lane_s16(input_3l, cospis1, 2); + t32[3] = vmull_lane_s16(input_3h, cospis1, 2); + t32[4] = vmull_lane_s16(input_3l, cospis1, 1); + t32[5] = vmull_lane_s16(input_3h, cospis1, 1); + t32[6] = vmull_lane_s16(input_1l, cospis1, 0); + t32[7] = vmull_lane_s16(input_1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input_5h, 
cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); + t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); + t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); + t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); + step1[4] = vcombine_s16(t16[0], t16[1]); + step1[5] = vcombine_s16(t16[2], t16[3]); + step1[6] = vcombine_s16(t16[4], t16[5]); + step1[7] = vcombine_s16(t16[6], t16[7]); + + // stage 2 + t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); + t32[3] = vmull_lane_s16(step1h[0], cospis0, 2); + t32[4] = vmull_lane_s16(step1l[1], cospis0, 3); + t32[5] = vmull_lane_s16(step1h[1], cospis0, 3); + t32[6] = vmull_lane_s16(step1l[1], cospis0, 1); + t32[7] = vmull_lane_s16(step1h[1], cospis0, 1); + t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1); + t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); + t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); + t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); + t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); + t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); + t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); + step2[0] = vcombine_s16(t16[0], t16[1]); + step2[1] = vcombine_s16(t16[2], t16[3]); + step2[2] = vcombine_s16(t16[4], t16[5]); + step2[3] = vcombine_s16(t16[6], t16[7]); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *io0 = vaddq_s16(step1[0], step2[7]); + *io1 = vaddq_s16(step1[1], step1[6]); + *io2 = vaddq_s16(step1[2], step1[5]); + *io3 = vaddq_s16(step1[3], step2[4]); + *io4 = vsubq_s16(step1[3], step2[4]); + *io5 = vsubq_s16(step1[2], step1[5]); + *io6 = vsubq_s16(step1[1], step1[6]); + *io7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t 
*const d1) { + int16x4_t t16[4]; + + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + *d0 = vcombine_s16(t16[0], t16[1]); + *d1 = vcombine_s16(t16[2], t16[3]); +} + +static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, + const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[2] = vnegq_s32(t32[2]); + t32[3] = vnegq_s32(t32[3]); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2); + t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), 
cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); + 
idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, + int16x8_t *const out) { +#if CONFIG_VP9_HIGHBITDEPTH + // Use saturating add/sub to avoid overflow in 2nd pass + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +#else + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); +#endif +} + +static INLINE void idct16x16_store_pass1(const int16x8_t *const out, + int16_t *output) { + // Save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); +} + +static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, + const int stride) { + uint8x8_t d = vld1_u8(*dest); + uint16x8_t q; + + res = vrshrq_n_s16(res, 6); + q = vaddw_u8(vreinterpretq_u16_s16(res), d); + d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, + uint16_t **dest, const int stride) { + uint16x8_t d = vld1q_u16(*dest); + + res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); + res = vminq_s16(res, max); + d = vqshluq_n_s16(res, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest, + const int stride) { + uint16x8_t d = vld1q_u16(*dest); + + res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6); + d = vmovl_u8(vqmovun_s16(res)); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, + uint16_t *out, const int b_stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride); + 
highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + 
highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output); + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, const int highbd_flag); + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag); + +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output); +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag); + +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag); + +#endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c index 38e79ed69d..fb1fa6b681 100644 --- a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -17,306 +17,254 @@ //------------------------------------------------------------------------------ // DC 4x4 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
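+// The dc_sum_N() helpers below fold the N reference pixels into a single
+// sum in lane 0; the matching dc_store_NxN() helpers broadcast lane 0 of the
+// DC value to every row of the block. With both edges available, the 4x4 DC
+// value is (above[0] + ... + above[3] + left[0] + ... + left[3] + 4) >> 3,
+// which vrshr_n_u16(sum, 3) computes as a rounding shift.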
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x4_t sum_top; - uint16x4_t sum_left; - uint16x4_t dc0; +static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + const uint16x4_t p0 = vpaddl_u8(ref_u8); + return vpadd_u16(p0, p0); +} - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - sum_top = vpadd_u16(p0, p0); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - sum_left = vpadd_u16(p0, p0); - } - - if (do_above && do_left) { - const uint16x4_t sum = vadd_u16(sum_left, sum_top); - dc0 = vrshr_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshr_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshr_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u16(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } +static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); + const uint8x8_t a = vld1_u8(above); + const uint8x8_t l = vld1_u8(left); + const uint16x8_t al = vaddl_u8(a, l); + uint16x4_t sum; + uint8x8_t dc; + sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); + dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); + dc_store_4x4(dst, stride, dc); } void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); + dc_store_4x4(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 8x8 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; +static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + uint16x4_t sum = vpaddl_u8(ref_u8); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); - } +static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1_u8(dst, dc_dup); } } void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); + const uint16x8_t p0 = vpaddlq_u8(above_and_left); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); + dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); + dc_store_8x8(dst, stride, dc); } void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); + dc_store_8x8(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 16x16 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; +static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { + const uint8x16_t ref_u8 = vld1q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(ref_u8); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } +static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u8(dst, dc_dup); } } void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); + const uint8x16_t ref0 = vld1q_u8(above); + const uint8x16_t ref1 = vld1q_u8(left); + const uint16x8_t p0 = vpaddlq_u8(ref0); + const uint16x8_t p1 = vpaddlq_u8(ref1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); + dc_store_16x16(dst, stride, dc); } void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); + dc_store_16x16(dst, stride, dc); } void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); + dc_store_16x16(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 32x32 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
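+// dc_sum_32() loads the 32 reference bytes with vld2q_u8; the de-interleaved
+// lane order is irrelevant because only the overall sum is used.
+// dc_store_32x32() writes each 32-byte row as two identical 16-byte vectors
+// through a single vst2q_u8.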
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; +static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { + const uint8x16x2_t r = vld2q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(r.val[0]); + const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } +static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + uint8x16x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } + for (i = 0; i < 32; ++i, dst += stride) { + vst2q_u8(dst, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); + const uint8x16x2_t a = vld2q_u8(above); + const uint8x16x2_t l = vld2q_u8(left); + const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); + const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); + const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); + const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal = vaddq_u16(pa, pl); + uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); + dc_store_32x32(dst, stride, dc); } void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)left; - dc_32x32(dst, stride, 
above, NULL, 1, 0); + dc_store_32x32(dst, stride, dc); } void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); + dc_store_32x32(dst, stride, dc); } // ----------------------------------------------------------------------------- void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t ABCDEFGH = vld1_u8(above); + const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); + const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); @@ -331,485 +279,764 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; + vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t above_right, uint8x8_t *row) { + *row = vext_u8(*row, above_right, 1); + vst1_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t A0 = vld1_u8(above); + const uint8x8_t above_right = vdup_lane_u8(A0, 7); + const uint8x8_t A1 = vext_u8(A0, above_right, 1); + const uint8x8_t A2 = vext_u8(A0, above_right, 2); const uint8x8_t avg1 = vhadd_u8(A0, A2); uint8x8_t row = vrhadd_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); + + vst1_u8(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1_u8(dst, above_right); +} + +static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t above_right, uint8x16_t *row) { + *row = vextq_u8(*row, above_right, 1); + vst1q_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A0 = vld1q_u8(above); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); const uint8x16_t A1 = vextq_u8(A0, above_right, 1); const uint8x16_t A2 = vextq_u8(A0, above_right, 2); const uint8x16_t avg1 
= vhaddq_u8(A0, A2); uint8x16_t row = vrhaddq_u8(avg1, A1); + (void)left; + + vst1q_u8(dst, row); + dst += stride; + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + vst1q_u8(dst, above_right); +} + +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t A0_0 = vld1q_u8(above); + const uint8x16_t A0_1 = vld1q_u8(above + 16); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); + const uint8x16_t A1_0 = vld1q_u8(above + 1); + const uint8x16_t A1_1 = vld1q_u8(above + 17); + const uint8x16_t A2_0 = vld1q_u8(above + 2); + const uint8x16_t A2_1 = vld1q_u8(above + 18); + const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); + const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); + uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); + uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); int i; (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); + + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u8(row_0, row_1, 1); + row_1 = vextq_u8(row_1, above_right, 1); + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; } - vst1q_u8(dst + i * stride, row); + + vst1q_u8(dst, above_right); + dst += 16; + vst1q_u8(dst, row_1); } // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint8x8_t XA0123 = vld1_u8(above - 1); + const uint8x8_t L0123 = vld1_u8(left); + const uint8x8_t L3210 = vrev64_u8(L0123); + const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); + const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); + const uint8x8_t L10XA0123_ = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); + const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); const 
uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + vst1_lane_u32((uint32_t *)dst, r0, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r1, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r2, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r3, 0); } +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XA0123456 = vld1_u8(above - 1); + const uint8x8_t A01234567 = vld1_u8(above); + const uint8x8_t A1234567_ = vld1_u8(above + 1); + const uint8x8_t L01234567 = vld1_u8(left); + const uint8x8_t L76543210 = vrev64_u8(L01234567); + const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1); + const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2); + const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456); + const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567); + const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); + const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); + const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); + const uint8x8_t row_0 = vget_low_u8(row); + const uint8x8_t row_1 = vget_high_u8(row); + const uint8x8_t r0 = vext_u8(row_0, row_1, 7); + const uint8x8_t r1 = vext_u8(row_0, row_1, 6); + const uint8x8_t r2 = vext_u8(row_0, row_1, 5); + const uint8x8_t r3 = vext_u8(row_0, row_1, 4); + const uint8x8_t r4 = vext_u8(row_0, row_1, 3); + const uint8x8_t r5 = vext_u8(row_0, row_1, 2); + const uint8x8_t r6 = vext_u8(row_0, row_1, 1); + + vst1_u8(dst, r0); + dst += stride; + vst1_u8(dst, r1); + dst += stride; + vst1_u8(dst, r2); + dst += stride; + vst1_u8(dst, r3); + dst += stride; + vst1_u8(dst, r4); + dst += stride; + vst1_u8(dst, r5); + dst += stride; + vst1_u8(dst, r6); + dst += stride; + vst1_u8(dst, row_0); +} + +static INLINE void d135_store_16x8( + uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0, + const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3, + const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6, + const uint8x16_t row_7) { + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t A0123456789abcdef = vld1q_u8(above); + const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1); + const uint8x16_t L0123456789abcdef = vld1q_u8(left); + const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef)); + const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef)); + const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210); + const uint8x16_t Ledcba9876543210X = + 
vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1); + const uint8x16_t Ldcba9876543210XA0 = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2); + const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0); + const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_b = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_c = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_d = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_e = vextq_u8(row_0, row_1, 1); + + d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7); + d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0); +} + +static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t row_0, + const uint8x16_t row_1, + const uint8x16_t row_2) { + uint8_t *dst2 = *dst; + vst1q_u8(dst2, row_1); + dst2 += 16; + vst1q_u8(dst2, row_2); + dst2 += 16 * stride - 16; + vst1q_u8(dst2, row_0); + dst2 += 16; + vst1q_u8(dst2, row_1); + *dst += stride; +} + +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16); + const uint8x16_t LU0123456789abcdef = vld1q_u8(left); + const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef)); + const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef)); + const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef)); + const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef)); + const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210); + const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210); + const uint8x16_t LLedcba9876543210Uf = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1); + const uint8x16_t LLdcba9876543210Ufe = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2); + const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf); + + const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t LUedcba9876543210X = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1); + const uint8x16_t LUdcba9876543210XA0 = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2); + const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X); + + const uint8x16_t AL0123456789abcdef = vld1q_u8(above); + const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1); + const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15); + const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16); + const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17); + const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, 
AL123456789abcdefg); + const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef); + const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_); + const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef); + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + d135_store_32x2(&dst, stride, row_0, row_1, row_2); +} + +// 
----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint32_t d = *(const uint32_t *)above; int i; - uint32x2_t d0u32 = vdup_n_u32(0); (void)left; - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) { + *(uint32_t *)dst = d; + } } void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t d = vld1_u8(above); int i; - uint8x8_t d0u8 = vdup_n_u8(0); (void)left; - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) { + vst1_u8(dst, d); + } } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vld1q_u8(above); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) { + vst1q_u8(dst, d); + } } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + for (i = 0; i < 32; i++) { + // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8. + // clang-3.8 unrolled the loop fully with no filler so the cause is likely + // the latency of the instruction. 
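+ // Each iteration writes one 32-byte row as two 16-byte stores; advancing
+ // by stride - 16 after the second store returns dst to the start of the
+ // next row.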
+ vst1q_u8(dst, d0); + dst += 16; + vst1q_u8(dst, d1); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); + const uint32x2_t zero = vdup_n_u32(0); + const uint8x8_t left_u8 = + vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0)); + uint8x8_t d; (void)above; - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); + const uint8x8_t left_u8 = vld1_u8(left); + uint8x8_t d; (void)above; - d1u64 = vld1_u64((const uint64_t *)left); + d = vdup_lane_u8(left_u8, 0); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 1); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 2); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 3); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 4); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 5); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 6); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 7); + vst1_u8(dst, d); +} - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); +static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); + *dst += 
stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; } void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + const uint8x16_t left_u8q = vld1q_u8(left); (void)above; - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += stride; - } + h_store_16x8(&dst, stride, vget_low_u8(left_u8q)); + h_store_16x8(&dst, stride, vget_high_u8(left_u8q)); +} + +static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8 + *dst += 16; + vst1q_u8(*dst, row_0); + *dst += stride - 16; + vst1q_u8(*dst, row_1); + *dst += 16; + vst1q_u8(*dst, row_1); + *dst += stride - 16; + vst1q_u8(*dst, row_2); + *dst += 16; + vst1q_u8(*dst, row_2); + *dst += stride - 16; + vst1q_u8(*dst, row_3); + *dst += 16; + vst1q_u8(*dst, row_3); + *dst += stride - 16; + vst1q_u8(*dst, row_4); + *dst += 16; + vst1q_u8(*dst, row_4); + *dst += stride - 16; + vst1q_u8(*dst, row_5); + *dst += 16; + vst1q_u8(*dst, row_5); + *dst += stride - 16; + vst1q_u8(*dst, row_6); + *dst += 16; + vst1q_u8(*dst, row_6); + *dst += stride - 16; + vst1q_u8(*dst, row_7); + *dst += 16; + vst1q_u8(*dst, row_7); + *dst += stride - 16; } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + int i; (void)above; - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - 
q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } + for (i = 0; i < 2; i++, left += 16) { + const uint8x16_t left_u8 = vld1q_u8(left); + h_store_32x8(&dst, stride, vget_low_u8(left_u8)); + h_store_32x8(&dst, stride, vget_high_u8(left_u8)); } } +// ----------------------------------------------------------------------------- + +static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); +} + void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x8_t above_u8 = vld1_u8(above); + const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8)); + int16x8_t sub, sum; + uint32x2_t d; - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } + sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8. + sub = vreinterpretq_s16_s64( + vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0)); + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); +} + +static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub) { + const int16x8_t sum = vaddq_s16(left_dup, sub); + const uint8x8_t d = vqmovun_s16(sum); + vst1_u8(*dst, d); + *dst += stride; } void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + int16x4_t left_s16d = vget_low_s16(left_s16q); + int i; - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - 
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + int16x8_t left_dup; + + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub); } } +static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + vst1_u8(*dst, d0); + *dst += 8; + vst1_u8(*dst, d1); + *dst += stride - 8; +} + void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_u8 = vld1q_u8(above); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left))); + int16x8_t left_dup; + int i; - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; + for (i = 0; i < 2; i++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + 
const int16x4_t left_low = vget_low_s16(left_s16q); + const int16x4_t left_high = vget_high_s16(left_s16q); - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } + left_dup = vdupq_lane_s16(left_low, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + + left_dup = vdupq_lane_s16(left_high, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); } } +static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const int16x8_t sum2 = vaddq_s16(left_dup, sub2); + const int16x8_t sum3 = vaddq_s16(left_dup, sub3); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + const uint8x8_t d2 = vqmovun_s16(sum2); + const uint8x8_t d3 = vqmovun_s16(sum3); + + vst1q_u8(*dst, vcombine_u8(d0, d1)); + *dst += 16; + vst1q_u8(*dst, vcombine_u8(d2, d3)); + *dst += stride - 16; +} + void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_low = vld1q_u8(above); + const uint8x16_t above_high = vld1q_u8(above + 16); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left))); + const int16x8_t sub2 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left))); + const int16x8_t sub3 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left))); + int16x8_t left_dup; + int i, j; - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = 
vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; + for (j = 0; j < 4; j++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, 
sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); } } } diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm index 5a8fdd6aff..730c40de0e 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm @@ -8,192 +8,659 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_horizontal_16_neon| + EXPORT |vpx_lpf_horizontal_16_dual_neon| + EXPORT |vpx_lpf_vertical_16_neon| + EXPORT |vpx_lpf_vertical_16_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, -; const uint8_t *blimit0, -; const uint8_t *limit0, -; const uint8_t *thresh0, -; const uint8_t *blimit1, -; const uint8_t *limit1, -; const uint8_t *thresh1) +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, -; r1 int p, -; r2 const uint8_t *blimit0, -; r3 const uint8_t *limit0, -; sp const uint8_t *thresh0, -; sp+4 const uint8_t *blimit1, -; sp+8 const uint8_t *limit1, -; sp+12 const uint8_t *thresh1, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_horizontal_edge| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh -|vpx_lpf_horizontal_4_dual_neon| PROC - push {lr} +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh - ldr r12, [sp, #4] ; load thresh0 - vld1.8 {d0}, [r2] ; load blimit0 to first half q - vld1.8 {d2}, [r3] ; load limit0 to first half q + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - add r1, r1, r1 ; double pitch - ldr r2, [sp, #8] ; load blimit1 + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 - vld1.8 {d4}, [r12] ; load thresh0 to first half q + bl vpx_wide_mbfilter_neon - ldr r3, [sp, #12] ; load limit1 - ldr r12, [sp, #16] ; load thresh1 - vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + tst r7, #1 + beq h_mbfilter - sub r2, r0, r1, lsl #1 ; s[-4 * p] + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. 
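+    ; Only the 4-tap filter outputs are written back: op1 and op0 on the two
+    ; rows above the edge, oq0 and oq1 on the two rows below it.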
+ sub r8, r0, r1, lsl #1 - vld1.8 {d3}, [r3] ; load limit1 to 2nd half q - vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 - vpush {d8-d15} ; save neon registers + b h_next - add r3, r2, r1, lsr #1 ; s[-3 * p] +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter - vld1.u8 {q3}, [r2@64], r1 ; p3 - vld1.u8 {q4}, [r3@64], r1 ; p2 - vld1.u8 {q5}, [r2@64], r1 ; p1 - vld1.u8 {q6}, [r3@64], r1 ; p0 - vld1.u8 {q7}, [r2@64], r1 ; q0 - vld1.u8 {q8}, [r3@64], r1 ; q1 - vld1.u8 {q9}, [r2@64] ; q2 - vld1.u8 {q10}, [r3@64] ; q3 + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 - bl vpx_loop_filter_neon_16 + b h_next - vst1.u8 {q5}, [r2@64], r1 ; store op1 - vst1.u8 {q6}, [r3@64], r1 ; store op0 - vst1.u8 {q7}, [r2@64], r1 ; store oq0 - vst1.u8 {q8}, [r3@64], r1 ; store oq1 +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 - vpop {d8-d15} ; restore neon registers + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 - pop {pc} - ENDP ; |vpx_lpf_horizontal_4_dual_neon| +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count -; void vpx_loop_filter_neon_16(); + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_horizontal_edge| + +; void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_16_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_16_neon| + +; void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_16_dual_neon| + +; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh, +; int count) { +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_vertical_edge_w| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +v_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, 
[r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1 + add r0, #2 + + b v_next + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_next + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_next + subs r12, #1 + bne v_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_vertical_edge_w| + +; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_neon| PROC + mov r12, #1 + b mb_lpf_vertical_edge_w + 
ENDP ; |vpx_lpf_vertical_16_neon| + +; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_vertical_edge_w + ENDP ; |vpx_lpf_vertical_16_dual_neon| + +; void vpx_wide_mbfilter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. This function uses -; registers d8-d15, so the calling function must save those registers. +; necessary load, transpose (if necessary) and store. ; -; r0-r3, r12 PRESERVE -; q0 blimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -; -; Outputs: -; q5 op1 -; q6 op0 -; q7 oq0 -; q8 oq1 -|vpx_loop_filter_neon_16| PROC +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|vpx_wide_mbfilter_neon| PROC + mov r7, #0 ; filter_mask - vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) - vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) - vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) - vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) - vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) - vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) ; only compare the largest value to limit - vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) - vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 - vabd.u8 q9, q6, q7 ; abs(p0 - q0) + vabd.u8 d24, d7, d8 ; abs(p0 - q0) - vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + vmax.u8 d19, d19, d23 - vmov.u8 q10, #0x80 + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + ; abs () > limit + vcge.u8 d19, d17, d19 - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 - veor q7, q7, q10 ; qs0 + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a - vcge.u8 q15, q1, q15 ; abs(m11) > limit + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 + vcge.u8 d20, d30, d20 ; flat - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a + vand d19, d19, d24 ; mask - veor q8, q8, q10 ; qs1 + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev - vmov.u16 q4, #3 + vand d16, 
d20, d19 ; flat && mask + vmov r5, r6, d16 - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) - vcge.u8 q9, q0, q9 ; a > blimit + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; hev + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 + vcge.u8 d18, d30, d23 ; flat2 - vand q1, q1, q14 ; filter &= hev - vand q15, q15, q9 ; mask + vmov.u8 d22, #0x80 - vmov.u8 q4, #3 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch - vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) - vaddw.s8 q11, q11, d3 + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 - vmov.u8 q9, #4 + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + vand d29, d29, d21 ; filter &= hev + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; filter &= mask + vqmovn.s16 d28, q15 - vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) - vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) - vshr.s8 q2, q2, #3 ; filter2 >>= 3 - vshr.s8 q1, q1, #3 ; filter1 >>= 3 + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) - vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - ; outer tap adjustments - vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev - veor q7, q0, q10 ; *oq0 = u^0x80 + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - vbic q1, q1, q14 ; filter &= ~hev + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + tst r7, #1 + bxne lr - veor q6, q11, q10 ; *op0 = u^0x80 - veor q5, q13, q10 ; *op1 = u^0x80 - veor q8, q12, q10 ; *oq1 = u^0x80 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter flat && mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? 
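+    ; Each flat-branch output is a rounded 8-sample average built with a
+    ; sliding sum, e.g. op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3.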
+ vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 
q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) bx lr - ENDP ; |vpx_loop_filter_neon_16| + ENDP ; |vpx_wide_mbfilter_neon| END diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm index 9371158984..907e918380 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm @@ -10,6 +10,8 @@ EXPORT |vpx_lpf_horizontal_4_neon| EXPORT |vpx_lpf_vertical_4_neon| + EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_vertical_4_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -53,7 +55,7 @@ sub r2, r2, r1, lsl #1 sub r3, r3, r1, lsl #1 - bl vpx_loop_filter_neon + bl filter4_8 vst1.u8 {d4}, [r2@64], r1 ; store op1 vst1.u8 {d5}, [r3@64], r1 ; store op0 @@ -113,7 +115,7 @@ vtrn.8 d7, d16 vtrn.8 d17, d18 - bl vpx_loop_filter_neon + bl filter4_8 sub r0, r0, #2 @@ -130,7 +132,7 @@ pop {pc} ENDP ; |vpx_lpf_vertical_4_neon| -; void vpx_loop_filter_neon(); +; void filter4_8(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. The function does not use ; registers d8-d15. @@ -154,7 +156,7 @@ ; d5 op0 ; d6 oq0 ; d7 oq1 -|vpx_loop_filter_neon| PROC +|filter4_8| PROC ; filter_mask vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) @@ -244,6 +246,304 @@ veor d7, d20, d18 ; *oq1 = u^0x80 bx lr - ENDP ; |vpx_loop_filter_neon| + ENDP ; |filter4_8| + +;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl filter4_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; 
store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_horizontal_4_dual_neon| + +;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_vertical_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, #4 ; s[-4] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07 + vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17 + vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27 + vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37 + vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47 + vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57 + vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67 + vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77 + vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87 + vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97 + vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7 + vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7 + vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7 + vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7 + vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7 + vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7 + + vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + ; q4 : 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97 + vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + + vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + + vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + + bl filter4_16 + + sub 
r0, #2 + + vmov d0, d11 + vmov d1, d13 + vmov d2, d15 + vmov d3, d17 + vmov d11, d12 + vmov d12, d14 + vmov d13, d16 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0] + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_vertical_4_dual_neon| + +; void filter4_16(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|filter4_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = 
clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |filter4_16| END diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm deleted file mode 100644 index 5279ecfb7f..0000000000 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm +++ /dev/null @@ -1,666 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_lpf_horizontal_edge_8_neon| - EXPORT |vpx_lpf_horizontal_edge_16_neon| - EXPORT |vpx_lpf_vertical_16_neon| - EXPORT |vpx_lpf_vertical_16_dual_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void mb_lpf_horizontal_edge(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_horizontal_edge| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -h_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d0}, [r8@64], r1 ; p7 - vld1.u8 {d1}, [r8@64], r1 ; p6 - vld1.u8 {d2}, [r8@64], r1 ; p5 - vld1.u8 {d3}, [r8@64], r1 ; p4 - vld1.u8 {d4}, [r8@64], r1 ; p3 - vld1.u8 {d5}, [r8@64], r1 ; p2 - vld1.u8 {d6}, [r8@64], r1 ; p1 - vld1.u8 {d7}, [r8@64], r1 ; p0 - vld1.u8 {d8}, [r8@64], r1 ; q0 - vld1.u8 {d9}, [r8@64], r1 ; q1 - vld1.u8 {d10}, [r8@64], r1 ; q2 - vld1.u8 {d11}, [r8@64], r1 ; q3 - vld1.u8 {d12}, [r8@64], r1 ; q4 - vld1.u8 {d13}, [r8@64], r1 ; q5 - vld1.u8 {d14}, [r8@64], r1 ; q6 - vld1.u8 {d15}, [r8@64], r1 ; q7 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. - sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8@64], r1 ; store op1 - vst1.u8 {d24}, [r8@64], r1 ; store op0 - vst1.u8 {d23}, [r8@64], r1 ; store oq0 - vst1.u8 {d26}, [r8@64], r1 ; store oq1 - - b h_next - -h_mbfilter - tst r7, #2 - beq h_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. 
- sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8@64], r1 ; store op2 - vst1.u8 {d19}, [r8@64], r1 ; store op1 - vst1.u8 {d20}, [r8@64], r1 ; store op0 - vst1.u8 {d21}, [r8@64], r1 ; store oq0 - vst1.u8 {d22}, [r8@64], r1 ; store oq1 - vst1.u8 {d23}, [r8@64], r1 ; store oq2 - - b h_next - -h_wide_mbfilter - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8@64], r1 ; store op6 - vst1.u8 {d24}, [r8@64], r1 ; store op5 - vst1.u8 {d25}, [r8@64], r1 ; store op4 - vst1.u8 {d26}, [r8@64], r1 ; store op3 - vst1.u8 {d27}, [r8@64], r1 ; store op2 - vst1.u8 {d18}, [r8@64], r1 ; store op1 - vst1.u8 {d19}, [r8@64], r1 ; store op0 - vst1.u8 {d20}, [r8@64], r1 ; store oq0 - vst1.u8 {d21}, [r8@64], r1 ; store oq1 - vst1.u8 {d22}, [r8@64], r1 ; store oq2 - vst1.u8 {d23}, [r8@64], r1 ; store oq3 - vst1.u8 {d1}, [r8@64], r1 ; store oq4 - vst1.u8 {d2}, [r8@64], r1 ; store oq5 - vst1.u8 {d3}, [r8@64], r1 ; store oq6 - -h_next - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_horizontal_edge| - -; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_horizontal_edge_8_neon| PROC - mov r12, #1 - b mb_lpf_horizontal_edge - ENDP ; |vpx_lpf_horizontal_edge_8_neon| - -; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_horizontal_edge_16_neon| PROC - mov r12, #2 - b mb_lpf_horizontal_edge - ENDP ; |vpx_lpf_horizontal_edge_16_neon| - -; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, -; const uint8_t *limit, const uint8_t *thresh, -; int count) { -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_vertical_edge_w| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -v_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8@64], r1 - vld1.8 {d8}, [r0@64], r1 - vld1.8 {d1}, [r8@64], r1 - vld1.8 {d9}, [r0@64], r1 - vld1.8 {d2}, [r8@64], r1 - vld1.8 {d10}, [r0@64], r1 - vld1.8 {d3}, [r8@64], r1 - vld1.8 {d11}, [r0@64], r1 - vld1.8 {d4}, [r8@64], r1 - vld1.8 {d12}, [r0@64], r1 - vld1.8 {d5}, [r8@64], r1 - vld1.8 {d13}, [r0@64], r1 - vld1.8 {d6}, [r8@64], r1 - vld1.8 {d14}, [r0@64], r1 - vld1.8 {d7}, [r8@64], r1 - vld1.8 {d15}, [r0@64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 - vtrn.8 d14, d15 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. 
- sub r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1 - add r0, #2 - - b v_next - -v_mbfilter - tst r7, #2 - beq v_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_next - -v_wide_mbfilter - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8@64], r1 - vst1.8 {d20}, [r0@64], r1 - vst1.8 {d16}, [r8@64], r1 - vst1.8 {d21}, [r0@64], r1 - vst1.8 {d24}, [r8@64], r1 - vst1.8 {d22}, [r0@64], r1 - vst1.8 {d25}, [r8@64], r1 - vst1.8 {d23}, [r0@64], r1 - vst1.8 {d26}, [r8@64], r1 - vst1.8 {d1}, [r0@64], r1 - vst1.8 {d27}, [r8@64], r1 - vst1.8 {d2}, [r0@64], r1 - vst1.8 {d18}, [r8@64], r1 - vst1.8 {d3}, [r0@64], r1 - vst1.8 {d19}, [r8@64], r1 - vst1.8 {d15}, [r0@64], r1 - -v_next - subs r12, #1 - bne v_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_vertical_edge_w| - -; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, -; const uint8_t *limit, const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_vertical_16_neon| PROC - mov r12, #1 - b mb_lpf_vertical_edge_w - ENDP ; |vpx_lpf_vertical_16_neon| - -; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_vertical_16_dual_neon| PROC - mov r12, #2 - b mb_lpf_vertical_edge_w - ENDP ; |vpx_lpf_vertical_16_dual_neon| - -; void vpx_wide_mbfilter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. 
-; -; r0-r3 PRESERVE -; d16 blimit -; d17 limit -; d18 thresh -; d0 p7 -; d1 p6 -; d2 p5 -; d3 p4 -; d4 p3 -; d5 p2 -; d6 p1 -; d7 p0 -; d8 q0 -; d9 q1 -; d10 q2 -; d11 q3 -; d12 q4 -; d13 q5 -; d14 q6 -; d15 q7 -|vpx_wide_mbfilter_neon| PROC - mov r7, #0 - - ; filter_mask - vabd.u8 d19, d4, d5 ; abs(p3 - p2) - vabd.u8 d20, d5, d6 ; abs(p2 - p1) - vabd.u8 d21, d6, d7 ; abs(p1 - p0) - vabd.u8 d22, d9, d8 ; abs(q1 - q0) - vabd.u8 d23, d10, d9 ; abs(q2 - q1) - vabd.u8 d24, d11, d10 ; abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 ; abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d17, d19 - - ; flatmask4 - vabd.u8 d25, d7, d5 ; abs(p0 - p2) - vabd.u8 d26, d8, d10 ; abs(q0 - q2) - vabd.u8 d27, d4, d7 ; abs(p3 - p0) - vabd.u8 d28, d11, d8 ; abs(q3 - q0) - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 ; a = a / 2 - vqadd.u8 d24, d24, d23 ; a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 ; flat - - vand d19, d19, d24 ; mask - - ; hevmask - vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 ; hev - - vand d16, d20, d19 ; flat && mask - vmov r5, r6, d16 - - ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 ; abs(p4 - p0) - vabd.u8 d23, d12, d8 ; abs(q4 - q0) - vabd.u8 d24, d7, d2 ; abs(p0 - p5) - vabd.u8 d25, d8, d13 ; abs(q0 - q5) - vabd.u8 d26, d1, d7 ; abs(p6 - p0) - vabd.u8 d27, d14, d8 ; abs(q6 - q0) - vabd.u8 d28, d0, d7 ; abs(p7 - p0) - vabd.u8 d29, d15, d8 ; abs(q7 - q0) - - ; only compare the largest value to thresh - vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 ; flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch - - vand d17, d18, d16 ; flat2 && flat && mask - vmov r5, r6, d17 - - ; mbfilter() function - - ; filter() function - ; convert to signed - veor d23, d8, d22 ; qs0 - veor d24, d7, d22 ; ps0 - veor d25, d6, d22 ; ps1 - veor d26, d9, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 
d29, d29, #1 - vbic d29, d29, d21 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d23, d23, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch - - ; mbfilter flat && mask branch - ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's - ; and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 - vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 ; r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 ; r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 ; r_op0 - - vsubw.u8 q15, d4 ; oq0 = op0 - p3 - vsubw.u8 q15, d7 ; oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 ; r_oq0 - - vsubw.u8 q15, d5 ; oq1 = oq0 - p2 - vsubw.u8 q15, d8 ; oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 ; r_oq1 - - vsubw.u8 q15, d6 ; oq2 = oq0 - p1 - vsubw.u8 q15, d9 ; oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 ; r_oq2 - - ; Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - ; wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 ; w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 ; w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 ; w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 ; w_op3 - - vaddw.u8 q15, q14, d5 ; op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 ; op2 += q4 - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 ; w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d13 ; op1 += q5 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 ; w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d14 ; op0 += q6 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 ; w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d15 ; oq0 += q7 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 ; w_oq0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vaddw.u8 q15, d9 ; 
oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 ; oq1 += q7 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 ; w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 ; w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 ; w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 ; w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 ; w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 ; w_oq6 - vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) - - bx lr - ENDP ; |vpx_wide_mbfilter_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.c deleted file mode 100644 index f95267472c..0000000000 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.c +++ /dev/null @@ -1,1093 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/arm/transpose_neon.h" - -// For all the static inline functions, the functions ending with '_8' process -// 8 samples in a bunch, and the functions ending with '_16' process 16 samples -// in a bunch. - -#define FUN_LOAD_THRESH(w, r) \ - static INLINE void load_thresh_##w( \ - const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ - uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \ - uint8x##w##_t *thresh_vec) { \ - *blimit_vec = vld1##r##dup_u8(blimit); \ - *limit_vec = vld1##r##dup_u8(limit); \ - *thresh_vec = vld1##r##dup_u8(thresh); \ - } - -FUN_LOAD_THRESH(8, _) // load_thresh_8 -FUN_LOAD_THRESH(16, q_) // load_thresh_16 -#undef FUN_LOAD_THRESH - -static INLINE void load_thresh_8_dual( - const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, - uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) { - *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1)); - *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1)); - *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1)); -} - -// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a -// pixel. When used to control filter branches, we only detect whether it is all -// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. -// flat equals 0 if and only if flat_status equals 0. -// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true -// because each mask occupies more than 1 bit.) 
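(Editor's illustration, not part of the patch: a minimal scalar sketch of the flat_status trick described in the comment above, assuming every byte of the mask is either 0x00 or 0xFF. calc_flat_status_8() below computes the same value with a single vpaddl_u32.)

#include <stdint.h>

/* Scalar model of calc_flat_status_8(). */
static uint32_t flat_status_8_scalar_model(const uint8_t flat[8]) {
  uint32_t lo = 0, hi = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    lo |= (uint32_t)flat[i] << (8 * i);     /* low 32-bit lane */
    hi |= (uint32_t)flat[4 + i] << (8 * i); /* high 32-bit lane */
  }
  /* vpaddl_u32 adds the two 32-bit lanes; the low 32 bits of that sum are
   * 0 when every byte is 0x00, and 0xFFFFFFFF + 0xFFFFFFFF = (uint32_t)-2
   * when every byte is 0xFF, so a single scalar compare can decide a whole
   * filter branch. */
  return lo + hi;
}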
-static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) { - return vget_lane_u32( - vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0); -} - -// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel. -// When used to control filter branches, we only detect whether it is all 0s or -// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so -// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel. -// Then we pairwise add flat to a 32-bit long number flat_status. -// flat equals 0 if and only if flat_status equals 0. -// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true -// because each mask occupies more than 1 bit.) -static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) { - const uint8x8_t flat_4bit = - vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4)); - return calc_flat_status_8(flat_4bit); -} - -#define FUN_FILTER_HEV_MASK4(w, r) \ - static INLINE uint8x##w##_t filter_hev_mask4_##w( \ - const uint8x##w##_t limit, const uint8x##w##_t blimit, \ - const uint8x##w##_t thresh, const uint8x##w##_t p3, \ - const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ - const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \ - uint8x##w##_t max, t0, t1; \ - \ - max = vabd##r##u8(p1, p0); \ - max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \ - *hev = vcgt##r##u8(max, thresh); \ - *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \ - *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \ - *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \ - *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \ - t0 = vabd##r##u8(p0, q0); \ - t1 = vabd##r##u8(p1, q1); \ - t0 = vqadd##r##u8(t0, t0); \ - t1 = vshr##r##n_u8(t1, 1); \ - t0 = vqadd##r##u8(t0, t1); \ - *mask = vcle##r##u8(*mask, limit); \ - t0 = vcle##r##u8(t0, blimit); \ - *mask = vand##r##u8(*mask, t0); \ - \ - return max; \ - } - -FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8 -FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16 -#undef FUN_FILTER_HEV_MASK4 - -#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ - static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ - const uint8x##w##_t limit, const uint8x##w##_t blimit, \ - const uint8x##w##_t thresh, const uint8x##w##_t p3, \ - const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ - const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ - uint8x##w##_t *hev) { \ - uint8x##w##_t max, mask; \ - \ - max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \ - q2, q3, hev, &mask); \ - *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \ - *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \ - *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \ - *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \ - *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \ - *flat = vand##r##u8(*flat, mask); \ - *flat_status = calc_flat_status_##w(*flat); \ - \ - return mask; \ - } - -FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8 -FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16 -#undef FUN_FILTER_FLAT_HEV_MASK - -#define FUN_FLAT_MASK5(w, r) \ - static INLINE uint8x##w##_t flat_mask5_##w( \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const 
uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t flat, \ - uint32_t *flat2_status) { \ - uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \ - flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \ - flat2 = vand##r##u8(flat2, flat); \ - *flat2_status = calc_flat_status_##w(flat2); \ - \ - return flat2; \ - } - -FUN_FLAT_MASK5(8, _) // flat_mask5_8 -FUN_FLAT_MASK5(16, q_) // flat_mask5_16 -#undef FUN_FLAT_MASK5 - -#define FUN_FLIP_SIGN(w, r) \ - static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \ - const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \ - return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \ - } - -FUN_FLIP_SIGN(8, _) // flip_sign_8 -FUN_FLIP_SIGN(16, q_) // flip_sign_16 -#undef FUN_FLIP_SIGN - -#define FUN_FLIP_SIGN_BACK(w, r) \ - static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ - const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ - return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ - } - -FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8 -FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16 -#undef FUN_FLIP_SIGN_BACK - -static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1, - const uint8x8_t add0, const uint8x8_t add1, - uint16x8_t *sum) { - *sum = vsubw_u8(*sum, sub0); - *sum = vsubw_u8(*sum, sub1); - *sum = vaddw_u8(*sum, add0); - *sum = vaddw_u8(*sum, add1); -} - -static INLINE void filter_update_16(const uint8x16_t sub0, - const uint8x16_t sub1, - const uint8x16_t add0, - const uint8x16_t add1, uint16x8_t *sum0, - uint16x8_t *sum1) { - *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0)); - *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0)); - *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1)); - *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1)); - *sum0 = vaddw_u8(*sum0, vget_low_u8(add0)); - *sum1 = vaddw_u8(*sum1, vget_high_u8(add0)); - *sum0 = vaddw_u8(*sum0, vget_low_u8(add1)); - *sum1 = vaddw_u8(*sum1, vget_high_u8(add1)); -} - -static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0, - const uint8x8_t sub1, - const uint8x8_t add0, - const uint8x8_t add1, - uint16x8_t *sum) { - filter_update_8(sub0, sub1, add0, add1, sum); - return vrshrn_n_u16(*sum, 3); -} - -static INLINE uint8x16_t calc_7_tap_filter_16_kernel( - const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0, - const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) { - filter_update_16(sub0, sub1, add0, add1, sum0, sum1); - return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3)); -} - -static INLINE uint8x8_t apply_15_tap_filter_8_kernel( - const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1, - const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in, - uint16x8_t *sum) { - filter_update_8(sub0, sub1, add0, add1, sum); - return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in); -} - -static INLINE uint8x16_t apply_15_tap_filter_16_kernel( - const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1, - const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in, - uint16x8_t *sum0, uint16x8_t *sum1) { - uint8x16_t t; - filter_update_16(sub0, sub1, add0, add1, sum0, sum1); - t = 
vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4)); - return vbslq_u8(flat, t, in); -} - -// 7-tap filter [1, 1, 1, 2, 1, 1, 1] -static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2, - const uint8x8_t p1, const uint8x8_t p0, - const uint8x8_t q0, const uint8x8_t q1, - const uint8x8_t q2, const uint8x8_t q3, - uint8x8_t *op2, uint8x8_t *op1, - uint8x8_t *op0, uint8x8_t *oq0, - uint8x8_t *oq1, uint8x8_t *oq2) { - uint16x8_t sum; - sum = vaddl_u8(p3, p3); // 2*p3 - sum = vaddw_u8(sum, p3); // 3*p3 - sum = vaddw_u8(sum, p2); // 3*p3+p2 - sum = vaddw_u8(sum, p2); // 3*p3+2*p2 - sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1 - sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0 - sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0 - *op2 = vrshrn_n_u16(sum, 3); - *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum); - *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum); - *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum); - *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum); - *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum); -} - -static INLINE void calc_7_tap_filter_16( - const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1, - const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, - const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1, - uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) { - uint16x8_t sum0, sum1; - sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3 - sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3 - sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3 - sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3 - sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2 - sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2 - sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2 - sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2 - sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1 - sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1 - sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0 - sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0 - sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0 - sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0 - *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3)); - *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1); - *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1); - *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1); - *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1); - *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1); -} - -#define FUN_APPLY_7_TAP_FILTER(w, r) \ - static INLINE void apply_7_tap_filter_##w( \ - const uint8x##w##_t flat, const uint8x##w##_t p3, \ - const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ - const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \ - uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \ - uint8x##w##_t *oq2) { \ - uint8x##w##_t tp1, tp0, tq0, tq1; \ - calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \ - &tq0, &tq1, oq2); \ - *op2 = vbsl##r##u8(flat, *op2, p2); \ - *op1 = vbsl##r##u8(flat, tp1, *op1); \ - *op0 = vbsl##r##u8(flat, tp0, *op0); \ - *oq0 = vbsl##r##u8(flat, tq0, *oq0); \ - *oq1 = vbsl##r##u8(flat, tq1, *oq1); \ - *oq2 = vbsl##r##u8(flat, *oq2, q2); \ - } - -FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8 
-FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16 -#undef FUN_APPLY_7_TAP_FILTER - -// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] -static INLINE void apply_15_tap_filter_8( - const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6, - const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3, - const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0, - const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2, - const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5, - const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, - uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, - uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, - uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { - uint16x8_t sum; - sum = vshll_n_u8(p7, 3); // 8*p7 - sum = vsubw_u8(sum, p7); // 7*p7 - sum = vaddw_u8(sum, p6); // 7*p7+p6 - sum = vaddw_u8(sum, p6); // 7*p7+2*p6 - sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5 - sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4 - sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3 - sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 - sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 - sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 - sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 - *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6); - *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum); - *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum); - *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum); - *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum); - *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum); - *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum); - *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); - *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); - *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); - *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum); - *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum); - *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum); - *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum); -} - -static INLINE void apply_15_tap_filter_16( - const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6, - const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3, - const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, - const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5, - const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5, - uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1, - uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2, - uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) { - uint16x8_t sum0, sum1; - uint8x16_t t; - sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7 - sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7 - sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7 - sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7 - sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6 - sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6 - sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6 - sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6 - sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5 - sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 
7*p7+2*p6+p5 - sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4 - sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4 - sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3 - sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3 - sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 - sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 - sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 - sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 - sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 - sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 - sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 - sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 - t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4)); - *op6 = vbslq_u8(flat2, t, p6); - *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1); - *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1); - *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1); - *op2 = - apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1); - *op1 = - apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1); - *op0 = - apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1); - *oq0 = - apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1); - *oq1 = - apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1); - *oq2 = - apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1); - *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1); - *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1); - *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1); - *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1); -} - -#define FUN_FILTER4(w, r) \ - static INLINE void filter4_##w( \ - const uint8x##w##_t mask, const uint8x##w##_t hev, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \ - uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \ - int8x##w##_t filter, filter1, filter2, t; \ - int8x##w##_t ps1 = flip_sign_##w(p1); \ - int8x##w##_t ps0 = flip_sign_##w(p0); \ - int8x##w##_t qs0 = flip_sign_##w(q0); \ - int8x##w##_t qs1 = flip_sign_##w(q1); \ - \ - /* add outer taps if we have high edge variance */ \ - filter = vqsub##r##s8(ps1, qs1); \ - filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ - t = vqsub##r##s8(qs0, ps0); \ - \ - /* inner taps */ \ - filter = vqadd##r##s8(filter, t); \ - filter = vqadd##r##s8(filter, t); \ - filter = vqadd##r##s8(filter, t); \ - filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ - \ - /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ - /* if it equals 4 we'll set to adjust by -1 to account for the fact */ \ - /* we'd round 3 the other way */ \ - filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ - filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ - \ - qs0 = vqsub##r##s8(qs0, filter1); \ - ps0 = vqadd##r##s8(ps0, filter2); \ - *oq0 = flip_sign_back_##w(qs0); \ - *op0 = flip_sign_back_##w(ps0); \ - \ - /* outer tap adjustments */ \ - filter = vrshr##r##n_s8(filter1, 1); \ - filter = vbic##r##s8(filter, 
vreinterpret##r##s8_u8(hev)); \ - \ - qs1 = vqsub##r##s8(qs1, filter); \ - ps1 = vqadd##r##s8(ps1, filter); \ - *oq1 = flip_sign_back_##w(qs1); \ - *op1 = flip_sign_back_##w(ps1); \ - } - -FUN_FILTER4(8, _) // filter4_8 -FUN_FILTER4(16, q_) // filter4_16 -#undef FUN_FILTER4 - -#define FUN_FILTER8(w) \ - static INLINE void filter8_##w( \ - const uint8x##w##_t mask, const uint8x##w##_t flat, \ - const uint32_t flat_status, const uint8x##w##_t hev, \ - const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \ - const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \ - const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \ - uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ - uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \ - if (flat_status != (uint32_t)-2) { \ - filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ - *op2 = p2; \ - *oq2 = q2; \ - if (flat_status) { \ - apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ - op0, oq0, oq1, oq2); \ - } \ - } else { \ - calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \ - oq0, oq1, oq2); \ - } \ - } - -FUN_FILTER8(8) // filter8_8 -FUN_FILTER8(16) // filter8_16 -#undef FUN_FILTER8 - -#define FUN_FILTER16(w) \ - static INLINE void filter16_##w( \ - const uint8x##w##_t mask, const uint8x##w##_t flat, \ - const uint32_t flat_status, const uint8x##w##_t flat2, \ - const uint32_t flat2_status, const uint8x##w##_t hev, \ - const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ - uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ - uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ - uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ - uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \ - if (flat_status != (uint32_t)-2) { \ - filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ - } \ - \ - if (flat_status) { \ - *op2 = p2; \ - *oq2 = q2; \ - if (flat2_status != (uint32_t)-2) { \ - apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ - op0, oq0, oq1, oq2); \ - } \ - if (flat2_status) { \ - apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \ - q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \ - op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ - oq6); \ - } \ - } \ - } - -FUN_FILTER16(8) // filter16_8 -FUN_FILTER16(16) // filter16_16 -#undef FUN_FILTER16 - -#define FUN_LOAD8(w, r) \ - static INLINE void load_##w##x8( \ - const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \ - uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \ - uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \ - *p3 = vld1##r##u8(s); \ - s += p; \ - *p2 = vld1##r##u8(s); \ - s += p; \ - *p1 = vld1##r##u8(s); \ - s += p; \ - *p0 = vld1##r##u8(s); \ - s += p; \ - *q0 = vld1##r##u8(s); \ - s += p; \ - *q1 = vld1##r##u8(s); \ - s += p; \ - *q2 = vld1##r##u8(s); \ - s += p; \ - *q3 = vld1##r##u8(s); \ - } - -FUN_LOAD8(8, _) // load_8x8 -FUN_LOAD8(16, q_) // load_16x8 -#undef FUN_LOAD8 - -#define FUN_LOAD16(w, r) \ - static INLINE void load_##w##x16( \ - const uint8_t *s, const int p, 
uint8x##w##_t *s0, uint8x##w##_t *s1, \ - uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \ - uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \ - uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \ - uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \ - uint8x##w##_t *s14, uint8x##w##_t *s15) { \ - *s0 = vld1##r##u8(s); \ - s += p; \ - *s1 = vld1##r##u8(s); \ - s += p; \ - *s2 = vld1##r##u8(s); \ - s += p; \ - *s3 = vld1##r##u8(s); \ - s += p; \ - *s4 = vld1##r##u8(s); \ - s += p; \ - *s5 = vld1##r##u8(s); \ - s += p; \ - *s6 = vld1##r##u8(s); \ - s += p; \ - *s7 = vld1##r##u8(s); \ - s += p; \ - *s8 = vld1##r##u8(s); \ - s += p; \ - *s9 = vld1##r##u8(s); \ - s += p; \ - *s10 = vld1##r##u8(s); \ - s += p; \ - *s11 = vld1##r##u8(s); \ - s += p; \ - *s12 = vld1##r##u8(s); \ - s += p; \ - *s13 = vld1##r##u8(s); \ - s += p; \ - *s14 = vld1##r##u8(s); \ - s += p; \ - *s15 = vld1##r##u8(s); \ - } - -FUN_LOAD16(8, _) // load_8x16 -FUN_LOAD16(16, q_) // load_16x16 -#undef FUN_LOAD16 - -#define FUN_STORE4(w, r) \ - static INLINE void store_##w##x4( \ - uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ - const uint8x##w##_t s2, const uint8x##w##_t s3) { \ - vst1##r##u8(s, s0); \ - s += p; \ - vst1##r##u8(s, s1); \ - s += p; \ - vst1##r##u8(s, s2); \ - s += p; \ - vst1##r##u8(s, s3); \ - } - -FUN_STORE4(8, _) // store_8x4 -FUN_STORE4(16, q_) // store_16x4 -#undef FUN_STORE4 - -#define FUN_STORE6(w, r) \ - static INLINE void store_##w##x6( \ - uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ - const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ - const uint8x##w##_t s5) { \ - vst1##r##u8(s, s0); \ - s += p; \ - vst1##r##u8(s, s1); \ - s += p; \ - vst1##r##u8(s, s2); \ - s += p; \ - vst1##r##u8(s, s3); \ - s += p; \ - vst1##r##u8(s, s4); \ - s += p; \ - vst1##r##u8(s, s5); \ - } - -FUN_STORE6(8, _) // store_8x6 -FUN_STORE6(16, q_) // store_16x6 -#undef FUN_STORE6 - -static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, - const uint8x8_t p0, const uint8x8_t q0, - const uint8x8_t q1) { - uint8x8x4_t o; - - o.val[0] = p1; - o.val[1] = p0; - o.val[2] = q0; - o.val[3] = q1; - vst4_lane_u8(s, o, 0); - s += p; - vst4_lane_u8(s, o, 1); - s += p; - vst4_lane_u8(s, o, 2); - s += p; - vst4_lane_u8(s, o, 3); - s += p; - vst4_lane_u8(s, o, 4); - s += p; - vst4_lane_u8(s, o, 5); - s += p; - vst4_lane_u8(s, o, 6); - s += p; - vst4_lane_u8(s, o, 7); -} - -static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, - const uint8x8_t s1, const uint8x8_t s2, - const uint8x8_t s3, const uint8x8_t s4, - const uint8x8_t s5) { - uint8x8x3_t o0, o1; - - o0.val[0] = s0; - o0.val[1] = s1; - o0.val[2] = s2; - o1.val[0] = s3; - o1.val[1] = s4; - o1.val[2] = s5; - vst3_lane_u8(s - 3, o0, 0); - vst3_lane_u8(s + 0, o1, 0); - s += p; - vst3_lane_u8(s - 3, o0, 1); - vst3_lane_u8(s + 0, o1, 1); - s += p; - vst3_lane_u8(s - 3, o0, 2); - vst3_lane_u8(s + 0, o1, 2); - s += p; - vst3_lane_u8(s - 3, o0, 3); - vst3_lane_u8(s + 0, o1, 3); - s += p; - vst3_lane_u8(s - 3, o0, 4); - vst3_lane_u8(s + 0, o1, 4); - s += p; - vst3_lane_u8(s - 3, o0, 5); - vst3_lane_u8(s + 0, o1, 5); - s += p; - vst3_lane_u8(s - 3, o0, 6); - vst3_lane_u8(s + 0, o1, 6); - s += p; - vst3_lane_u8(s - 3, o0, 7); - vst3_lane_u8(s + 0, o1, 7); -} - -#define FUN_STORE8(w, r) \ - static INLINE void store_##w##x8( \ - uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ - const uint8x##w##_t s2, const 
uint8x##w##_t s3, const uint8x##w##_t s4, \ - const uint8x##w##_t s5, const uint8x##w##_t s6, \ - const uint8x##w##_t s7) { \ - vst1##r##u8(s, s0); \ - s += p; \ - vst1##r##u8(s, s1); \ - s += p; \ - vst1##r##u8(s, s2); \ - s += p; \ - vst1##r##u8(s, s3); \ - s += p; \ - vst1##r##u8(s, s4); \ - s += p; \ - vst1##r##u8(s, s5); \ - s += p; \ - vst1##r##u8(s, s6); \ - s += p; \ - vst1##r##u8(s, s7); \ - } - -FUN_STORE8(8, _) // store_8x8 -FUN_STORE8(16, q_) // store_16x8 -#undef FUN_STORE8 - -#define FUN_STORE14(w, r) \ - static INLINE void store_##w##x14( \ - uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint32_t flat_status, const uint32_t flat2_status) { \ - if (flat_status) { \ - if (flat2_status) { \ - vst1##r##u8(s - 7 * p, p6); \ - vst1##r##u8(s - 6 * p, p5); \ - vst1##r##u8(s - 5 * p, p4); \ - vst1##r##u8(s - 4 * p, p3); \ - vst1##r##u8(s + 3 * p, q3); \ - vst1##r##u8(s + 4 * p, q4); \ - vst1##r##u8(s + 5 * p, q5); \ - vst1##r##u8(s + 6 * p, q6); \ - } \ - vst1##r##u8(s - 3 * p, p2); \ - vst1##r##u8(s + 2 * p, q2); \ - } \ - vst1##r##u8(s - 2 * p, p1); \ - vst1##r##u8(s - 1 * p, p0); \ - vst1##r##u8(s + 0 * p, q0); \ - vst1##r##u8(s + 1 * p, q1); \ - } - -FUN_STORE14(8, _) // store_8x14 -FUN_STORE14(16, q_) // store_16x14 -#undef FUN_STORE14 - -static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, - const uint8x16_t s1, const uint8x16_t s2, - const uint8x16_t s3, const uint8x16_t s4, - const uint8x16_t s5, const uint8x16_t s6, - const uint8x16_t s7, const uint8x16_t s8, - const uint8x16_t s9, const uint8x16_t s10, - const uint8x16_t s11, const uint8x16_t s12, - const uint8x16_t s13, const uint8x16_t s14, - const uint8x16_t s15) { - vst1q_u8(s, s0); - s += p; - vst1q_u8(s, s1); - s += p; - vst1q_u8(s, s2); - s += p; - vst1q_u8(s, s3); - s += p; - vst1q_u8(s, s4); - s += p; - vst1q_u8(s, s5); - s += p; - vst1q_u8(s, s6); - s += p; - vst1q_u8(s, s7); - s += p; - vst1q_u8(s, s8); - s += p; - vst1q_u8(s, s9); - s += p; - vst1q_u8(s, s10); - s += p; - vst1q_u8(s, s11); - s += p; - vst1q_u8(s, s12); - s += p; - vst1q_u8(s, s13); - s += p; - vst1q_u8(s, s14); - s += p; - vst1q_u8(s, s15); -} - -#define FUN_HOR_4_KERNEL(name, w) \ - static INLINE void lpf_horizontal_4##name##kernel( \ - uint8_t *s, const int p, const uint8x##w##_t blimit, \ - const uint8x##w##_t limit, const uint8x##w##_t thresh) { \ - uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \ - \ - load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \ - filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \ - q3, &hev, &mask); \ - filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \ - store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \ - } - -FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel -FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel -#undef FUN_HOR_4_KERNEL - -void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec; - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec); -} - -void 
vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec; - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec); -} - -void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - mask, hev; - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, - q2, q3, &hev, &mask); - filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); - store_4x8(s - 2, p, p1, p0, q0, q1); -} - -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - mask, hev; - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, - &s11, &s12, &s13, &s14, &s15); - transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, - q2, q3, &hev, &mask); - filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); - s -= 2; - store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), - vget_low_u8(q1)); - store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), - vget_high_u8(q1)); -} - -void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint32_t flat_status; - - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, &flat, &flat_status, &hev); - filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); -} - -void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint32_t flat_status; - - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, 
&flat, &flat_status, &hev); - filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); -} - -void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint32_t flat_status; - - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, &flat, &flat_status, &hev); - filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - // Note: tranpose + store_8x8() is faster than store_6x8(). - transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); - store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); -} - -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - uint32_t flat_status; - - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, - &s11, &s12, &s13, &s14, &s15); - transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, &flat, &flat_status, &hev); - filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - // Note: store_6x8() twice is faster than tranpose + store_8x16(). 
- store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), - vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); - store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), - vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), - vget_high_u8(oq2)); -} - -#define FUN_LPF_16_KERNEL(name, w) \ - static INLINE void lpf_16##name##kernel( \ - const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ - const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ - uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ - uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ - uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ - uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \ - uint32_t *flat_status, uint32_t *flat2_status) { \ - uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \ - \ - load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \ - &thresh_vec); \ - mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \ - p1, p0, q0, q1, q2, q3, &flat, \ - flat_status, &hev); \ - flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \ - flat2_status); \ - filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \ - p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \ - op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ - oq6); \ - } - -FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel -FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel -#undef FUN_LPF_16_KERNEL - -void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, - op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint32_t flat_status, flat2_status; - - load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, - &q3, &q4, &q5, &q6, &q7); - lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, - q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, - &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, - &flat2_status); - store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, - oq5, oq6, flat_status, flat2_status); -} - -void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, - op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint32_t flat_status, flat2_status; - - load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - p7 = vld1q_u8(s - 8 * p); - p6 = vld1q_u8(s - 7 * p); - p5 = vld1q_u8(s - 6 * p); - p4 = vld1q_u8(s - 5 * p); - q4 = vld1q_u8(s + 4 * p); - q5 = vld1q_u8(s + 5 * p); - q6 = vld1q_u8(s + 6 * p); - q7 = vld1q_u8(s + 7 * p); - lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, - q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, - &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, - &flat_status, &flat2_status); - 
store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, - oq5, oq6, flat_status, flat2_status); -} - -void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, - op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; - uint32_t flat_status, flat2_status; - - s -= 8; - load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3, - &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, - q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, - &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, - &flat2_status); - if (flat_status) { - if (flat2_status) { - transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, - oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, - &s6, &s7); - store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); - } else { - // Note: tranpose + store_8x8() is faster than store_6x8(). - transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); - store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); - } - } else { - store_4x8(s + 6, p, op1, op0, oq0, oq1); - } -} - -void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, - op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - uint32_t flat_status, flat2_status; - - s -= 8; - load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14, &s15); - transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, - &q2, &q3, &q4, &q5, &q6, &q7); - lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, - q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, - &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, - &flat_status, &flat2_status); - if (flat_status) { - if (flat2_status) { - transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, - oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, - &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14, - &s15); - store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, - s13, s14, s15); - } else { - // Note: store_6x8() twice is faster than tranpose + store_8x16(). 
- s += 8; - store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), - vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); - store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), - vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), - vget_high_u8(oq2)); - } - } else { - s += 6; - store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), - vget_low_u8(oq1)); - store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0), - vget_high_u8(oq0), vget_high_u8(oq1)); - } -} diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c index ced5aef0ab..7419cea022 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -9,15 +9,1084 @@ */ #include - -#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx/vpx_integer.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// For all the static inline functions, the functions ending with '_8' process +// 8 samples in a bunch, and the functions ending with '_16' process 16 samples +// in a bunch. + +#define FUN_LOAD_THRESH(w, r) \ + static INLINE void load_thresh_##w( \ + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ + uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \ + uint8x##w##_t *thresh_vec) { \ + *blimit_vec = vld1##r##dup_u8(blimit); \ + *limit_vec = vld1##r##dup_u8(limit); \ + *thresh_vec = vld1##r##dup_u8(thresh); \ + } + +FUN_LOAD_THRESH(8, _) // load_thresh_8 +FUN_LOAD_THRESH(16, q_) // load_thresh_16 +#undef FUN_LOAD_THRESH + +static INLINE void load_thresh_8_dual( + const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, + uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) { + *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1)); + *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1)); + *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1)); +} + +// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a +// pixel. When used to control filter branches, we only detect whether it is all +// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) { + return vget_lane_u32( + vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0); +} + +// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel. +// When used to control filter branches, we only detect whether it is all 0s or +// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so +// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel. +// Then we pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true +// because each mask occupies more than 1 bit.) 
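(Editor's illustration, not part of the patch: a scalar model of the narrowing shift described in the comment above, assuming every mask byte is 0x00 or 0xFF. calc_flat_status_16() below performs it with one vshrn_n_s16 and then reuses the 8-byte helper.)

#include <stdint.h>

/* Scalar model of the vshrn_n_s16(..., 4) step: each output byte keeps the
 * high nibble of one mask byte and the low nibble of the next, so an
 * all-0xFF (or all-0x00) 16-byte mask collapses to an all-0xFF (or all-0x00)
 * 8-byte mask. */
static void narrow_flat_16_to_8_scalar_model(const uint8_t flat[16],
                                             uint8_t flat_4bit[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    const unsigned pair = flat[2 * i] | ((unsigned)flat[2 * i + 1] << 8);
    flat_4bit[i] = (uint8_t)((pair >> 4) & 0xFF); /* bits 4..11 of the pair */
  }
}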
+static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) { + const uint8x8_t flat_4bit = + vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4)); + return calc_flat_status_8(flat_4bit); +} + +#define FUN_FILTER_HEV_MASK4(w, r) \ + static INLINE uint8x##w##_t filter_hev_mask4_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \ + uint8x##w##_t max, t0, t1; \ + \ + max = vabd##r##u8(p1, p0); \ + max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \ + *hev = vcgt##r##u8(max, thresh); \ + *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \ + t0 = vabd##r##u8(p0, q0); \ + t1 = vabd##r##u8(p1, q1); \ + t0 = vqadd##r##u8(t0, t0); \ + t1 = vshr##r##n_u8(t1, 1); \ + t0 = vqadd##r##u8(t0, t1); \ + *mask = vcle##r##u8(*mask, limit); \ + t0 = vcle##r##u8(t0, blimit); \ + *mask = vand##r##u8(*mask, t0); \ + \ + return max; \ + } + +FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8 +FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16 +#undef FUN_FILTER_HEV_MASK4 + +#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ + static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ + uint8x##w##_t *hev) { \ + uint8x##w##_t max, mask; \ + \ + max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \ + q2, q3, hev, &mask); \ + *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \ + *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \ + *flat = vand##r##u8(*flat, mask); \ + *flat_status = calc_flat_status_##w(*flat); \ + \ + return mask; \ + } + +FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8 +FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16 +#undef FUN_FILTER_FLAT_HEV_MASK + +#define FUN_FLAT_MASK5(w, r) \ + static INLINE uint8x##w##_t flat_mask5_##w( \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t flat, \ + uint32_t *flat2_status) { \ + uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \ + flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \ + flat2 = vand##r##u8(flat2, flat); \ + *flat2_status = calc_flat_status_##w(flat2); \ + \ + return flat2; \ + } + 
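(Editor's illustration, not part of the patch: the per-pixel test that the FUN_FLAT_MASK5 macro above vectorizes, written out in scalar C; the 8- and 16-lane instantiations follow.)

#include <stdint.h>

static int udiff(int a, int b) { return a > b ? a - b : b - a; }

/* Returns 0xFF where every outer tap is within 1 of its side's edge sample
 * (p-side against p0, q-side against q0) and the narrower flat mask is
 * already set, 0x00 otherwise. */
static uint8_t flat_mask5_scalar_model(uint8_t p4, uint8_t p3, uint8_t p2,
                                       uint8_t p1, uint8_t p0, uint8_t q0,
                                       uint8_t q1, uint8_t q2, uint8_t q3,
                                       uint8_t q4, uint8_t flat) {
  int m = udiff(p4, p0);
  if (udiff(p3, p0) > m) m = udiff(p3, p0);
  if (udiff(p2, p0) > m) m = udiff(p2, p0);
  if (udiff(p1, p0) > m) m = udiff(p1, p0);
  if (udiff(q1, q0) > m) m = udiff(q1, q0);
  if (udiff(q2, q0) > m) m = udiff(q2, q0);
  if (udiff(q3, q0) > m) m = udiff(q3, q0);
  if (udiff(q4, q0) > m) m = udiff(q4, q0);
  return (uint8_t)((m <= 1 ? 0xFF : 0x00) & flat); /* vcle then vand */
}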
+FUN_FLAT_MASK5(8, _) // flat_mask5_8 +FUN_FLAT_MASK5(16, q_) // flat_mask5_16 +#undef FUN_FLAT_MASK5 + +#define FUN_FLIP_SIGN(w, r) \ + static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \ + const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \ + return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \ + } + +FUN_FLIP_SIGN(8, _) // flip_sign_8 +FUN_FLIP_SIGN(16, q_) // flip_sign_16 +#undef FUN_FLIP_SIGN + +#define FUN_FLIP_SIGN_BACK(w, r) \ + static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ + const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ + return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ + } + +FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8 +FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16 +#undef FUN_FLIP_SIGN_BACK + +static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1, + const uint8x8_t add0, const uint8x8_t add1, + uint16x8_t *sum) { + *sum = vsubw_u8(*sum, sub0); + *sum = vsubw_u8(*sum, sub1); + *sum = vaddw_u8(*sum, add0); + *sum = vaddw_u8(*sum, add1); +} + +static INLINE void filter_update_16(const uint8x16_t sub0, + const uint8x16_t sub1, + const uint8x16_t add0, + const uint8x16_t add1, uint16x8_t *sum0, + uint16x8_t *sum1) { + *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0)); + *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0)); + *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1)); + *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1)); + *sum0 = vaddw_u8(*sum0, vget_low_u8(add0)); + *sum1 = vaddw_u8(*sum1, vget_high_u8(add0)); + *sum0 = vaddw_u8(*sum0, vget_low_u8(add1)); + *sum1 = vaddw_u8(*sum1, vget_high_u8(add1)); +} + +static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0, + const uint8x8_t sub1, + const uint8x8_t add0, + const uint8x8_t add1, + uint16x8_t *sum) { + filter_update_8(sub0, sub1, add0, add1, sum); + return vrshrn_n_u16(*sum, 3); +} + +static INLINE uint8x16_t calc_7_tap_filter_16_kernel( + const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0, + const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) { + filter_update_16(sub0, sub1, add0, add1, sum0, sum1); + return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3)); +} + +static INLINE uint8x8_t apply_15_tap_filter_8_kernel( + const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1, + const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in, + uint16x8_t *sum) { + filter_update_8(sub0, sub1, add0, add1, sum); + return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in); +} + +static INLINE uint8x16_t apply_15_tap_filter_16_kernel( + const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1, + const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in, + uint16x8_t *sum0, uint16x8_t *sum1) { + uint8x16_t t; + filter_update_16(sub0, sub1, add0, add1, sum0, sum1); + t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4)); + return vbslq_u8(flat, t, in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2, + const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, + const uint8x8_t q2, const uint8x8_t q3, + uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, + uint8x8_t *oq1, uint8x8_t *oq2) { + uint16x8_t sum; + sum = vaddl_u8(p3, p3); // 2*p3 + sum = vaddw_u8(sum, p3); // 3*p3 + sum = vaddw_u8(sum, p2); // 3*p3+p2 + sum = vaddw_u8(sum, p2); // 3*p3+2*p2 + sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1 + sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddw_u8(sum, q0); // 
3*p3+2*p2+p1+p0+q0 + *op2 = vrshrn_n_u16(sum, 3); + *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void calc_7_tap_filter_16( + const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1, + const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1, + uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) { + uint16x8_t sum0, sum1; + sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3 + sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3 + sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1 + sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1 + sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0 + sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0 + sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0 + sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0 + *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3)); + *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1); + *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1); + *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1); + *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1); + *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1); +} + +#define FUN_APPLY_7_TAP_FILTER(w, r) \ + static INLINE void apply_7_tap_filter_##w( \ + const uint8x##w##_t flat, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \ + uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \ + uint8x##w##_t *oq2) { \ + uint8x##w##_t tp1, tp0, tq0, tq1; \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \ + &tq0, &tq1, oq2); \ + *op2 = vbsl##r##u8(flat, *op2, p2); \ + *op1 = vbsl##r##u8(flat, tp1, *op1); \ + *op0 = vbsl##r##u8(flat, tp0, *op0); \ + *oq0 = vbsl##r##u8(flat, tq0, *oq0); \ + *oq1 = vbsl##r##u8(flat, tq1, *oq1); \ + *oq2 = vbsl##r##u8(flat, *oq2, q2); \ + } + +FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8 +FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16 +#undef FUN_APPLY_7_TAP_FILTER + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter_8( + const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6, + const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3, + const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2, + const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5, + const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, + uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, + uint8x8_t *oq3, 
uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { + uint16x8_t sum; + sum = vshll_n_u8(p7, 3); // 8*p7 + sum = vsubw_u8(sum, p7); // 7*p7 + sum = vaddw_u8(sum, p6); // 7*p7+p6 + sum = vaddw_u8(sum, p6); // 7*p7+2*p6 + sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5 + sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void apply_15_tap_filter_16( + const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6, + const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3, + const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, + const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5, + const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5, + uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1, + uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2, + uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) { + uint16x8_t sum0, sum1; + uint8x16_t t; + sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7 + sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7 + sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7 + sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7 + sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6 + sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6 + sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6 + sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6 + sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5 + sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 7*p7+2*p6+p5 + sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4 + sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4 + sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3 + sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 
7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4)); + *op6 = vbslq_u8(flat2, t, p6); + *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1); + *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1); + *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1); + *op2 = + apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1); + *op1 = + apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1); + *op0 = + apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1); + *oq0 = + apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1); + *oq1 = + apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1); + *oq2 = + apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1); + *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1); + *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1); + *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1); + *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1); +} + +#define FUN_FILTER4(w, r) \ + static INLINE void filter4_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t hev, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \ + uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \ + int8x##w##_t filter, filter1, filter2, t; \ + int8x##w##_t ps1 = flip_sign_##w(p1); \ + int8x##w##_t ps0 = flip_sign_##w(p0); \ + int8x##w##_t qs0 = flip_sign_##w(q0); \ + int8x##w##_t qs1 = flip_sign_##w(q1); \ + \ + /* add outer taps if we have high edge variance */ \ + filter = vqsub##r##s8(ps1, qs1); \ + filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ + t = vqsub##r##s8(qs0, ps0); \ + \ + /* inner taps */ \ + filter = vqadd##r##s8(filter, t); \ + filter = vqadd##r##s8(filter, t); \ + filter = vqadd##r##s8(filter, t); \ + filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ + \ + /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \ + /* we'd round it by 3 the other way */ \ + filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ + filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ + \ + qs0 = vqsub##r##s8(qs0, filter1); \ + ps0 = vqadd##r##s8(ps0, filter2); \ + *oq0 = flip_sign_back_##w(qs0); \ + *op0 = flip_sign_back_##w(ps0); \ + \ + /* outer tap adjustments */ \ + filter = vrshr##r##n_s8(filter1, 1); \ + filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ + \ + qs1 = vqsub##r##s8(qs1, filter); \ + ps1 = vqadd##r##s8(ps1, filter); \ + *oq1 = flip_sign_back_##w(qs1); \ + *op1 = flip_sign_back_##w(ps1); \ + } + +FUN_FILTER4(8, _) // filter4_8 +FUN_FILTER4(16, q_) // filter4_16 +#undef FUN_FILTER4 + +#define FUN_FILTER8(w) \ + static INLINE void filter8_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \ + const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \ + const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + 
uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat_status) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + } else { \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \ + oq0, oq1, oq2); \ + } \ + } + +FUN_FILTER8(8) // filter8_8 +FUN_FILTER8(16) // filter8_16 +#undef FUN_FILTER8 + +#define FUN_FILTER16(w) \ + static INLINE void filter16_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t flat2, \ + const uint32_t flat2_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ + uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ + uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + } \ + \ + if (flat_status) { \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat2_status != (uint32_t)-2) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + if (flat2_status) { \ + apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \ + q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \ + op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ + oq6); \ + } \ + } \ + } + +FUN_FILTER16(8) // filter16_8 +FUN_FILTER16(16) // filter16_16 +#undef FUN_FILTER16 + +#define FUN_LOAD8(w, r) \ + static INLINE void load_##w##x8( \ + const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \ + uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \ + uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \ + *p3 = vld1##r##u8(s); \ + s += p; \ + *p2 = vld1##r##u8(s); \ + s += p; \ + *p1 = vld1##r##u8(s); \ + s += p; \ + *p0 = vld1##r##u8(s); \ + s += p; \ + *q0 = vld1##r##u8(s); \ + s += p; \ + *q1 = vld1##r##u8(s); \ + s += p; \ + *q2 = vld1##r##u8(s); \ + s += p; \ + *q3 = vld1##r##u8(s); \ + } + +FUN_LOAD8(8, _) // load_8x8 +FUN_LOAD8(16, q_) // load_16x8 +#undef FUN_LOAD8 + +#define FUN_LOAD16(w, r) \ + static INLINE void load_##w##x16( \ + const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \ + uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \ + uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \ + uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \ + uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \ + uint8x##w##_t *s14, uint8x##w##_t *s15) { \ + *s0 = vld1##r##u8(s); \ + s += p; \ + *s1 = vld1##r##u8(s); \ + s += p; \ + *s2 = vld1##r##u8(s); \ + s += p; \ + *s3 = vld1##r##u8(s); \ + s += p; \ + *s4 = vld1##r##u8(s); \ + s += p; \ + *s5 = vld1##r##u8(s); \ + s += p; \ + *s6 = vld1##r##u8(s); \ + s += p; \ + *s7 = vld1##r##u8(s); \ + s += p; \ + *s8 = vld1##r##u8(s); \ + s += p; \ + *s9 = vld1##r##u8(s); \ + s += p; \ + *s10 = vld1##r##u8(s); \ + 
s += p; \ + *s11 = vld1##r##u8(s); \ + s += p; \ + *s12 = vld1##r##u8(s); \ + s += p; \ + *s13 = vld1##r##u8(s); \ + s += p; \ + *s14 = vld1##r##u8(s); \ + s += p; \ + *s15 = vld1##r##u8(s); \ + } + +FUN_LOAD16(8, _) // load_8x16 +FUN_LOAD16(16, q_) // load_16x16 +#undef FUN_LOAD16 + +#define FUN_STORE4(w, r) \ + static INLINE void store_##w##x4( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + } + +FUN_STORE4(8, _) // store_8x4 +FUN_STORE4(16, q_) // store_16x4 +#undef FUN_STORE4 + +#define FUN_STORE6(w, r) \ + static INLINE void store_##w##x6( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + } + +FUN_STORE6(8, _) // store_8x6 +FUN_STORE6(16, q_) // store_16x6 +#undef FUN_STORE6 + +static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1) { + uint8x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4_lane_u8(s, o, 0); + s += p; + vst4_lane_u8(s, o, 1); + s += p; + vst4_lane_u8(s, o, 2); + s += p; + vst4_lane_u8(s, o, 3); + s += p; + vst4_lane_u8(s, o, 4); + s += p; + vst4_lane_u8(s, o, 5); + s += p; + vst4_lane_u8(s, o, 6); + s += p; + vst4_lane_u8(s, o, 7); +} + +static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5) { + uint8x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3_lane_u8(s - 3, o0, 0); + vst3_lane_u8(s + 0, o1, 0); + s += p; + vst3_lane_u8(s - 3, o0, 1); + vst3_lane_u8(s + 0, o1, 1); + s += p; + vst3_lane_u8(s - 3, o0, 2); + vst3_lane_u8(s + 0, o1, 2); + s += p; + vst3_lane_u8(s - 3, o0, 3); + vst3_lane_u8(s + 0, o1, 3); + s += p; + vst3_lane_u8(s - 3, o0, 4); + vst3_lane_u8(s + 0, o1, 4); + s += p; + vst3_lane_u8(s - 3, o0, 5); + vst3_lane_u8(s + 0, o1, 5); + s += p; + vst3_lane_u8(s - 3, o0, 6); + vst3_lane_u8(s + 0, o1, 6); + s += p; + vst3_lane_u8(s - 3, o0, 7); + vst3_lane_u8(s + 0, o1, 7); +} + +#define FUN_STORE8(w, r) \ + static INLINE void store_##w##x8( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5, const uint8x##w##_t s6, \ + const uint8x##w##_t s7) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + s += p; \ + vst1##r##u8(s, s6); \ + s += p; \ + vst1##r##u8(s, s7); \ + } + +FUN_STORE8(8, _) // store_8x8 +FUN_STORE8(16, q_) // store_16x8 +#undef FUN_STORE8 + +#define FUN_STORE14(w, r) \ + static INLINE void store_##w##x14( \ + uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const 
uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint32_t flat_status, const uint32_t flat2_status) { \ + if (flat_status) { \ + if (flat2_status) { \ + vst1##r##u8(s - 7 * p, p6); \ + vst1##r##u8(s - 6 * p, p5); \ + vst1##r##u8(s - 5 * p, p4); \ + vst1##r##u8(s - 4 * p, p3); \ + vst1##r##u8(s + 3 * p, q3); \ + vst1##r##u8(s + 4 * p, q4); \ + vst1##r##u8(s + 5 * p, q5); \ + vst1##r##u8(s + 6 * p, q6); \ + } \ + vst1##r##u8(s - 3 * p, p2); \ + vst1##r##u8(s + 2 * p, q2); \ + } \ + vst1##r##u8(s - 2 * p, p1); \ + vst1##r##u8(s - 1 * p, p0); \ + vst1##r##u8(s + 0 * p, q0); \ + vst1##r##u8(s + 1 * p, q1); \ + } + +FUN_STORE14(8, _) // store_8x14 +FUN_STORE14(16, q_) // store_16x14 +#undef FUN_STORE14 + +static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3, const uint8x16_t s4, + const uint8x16_t s5, const uint8x16_t s6, + const uint8x16_t s7, const uint8x16_t s8, + const uint8x16_t s9, const uint8x16_t s10, + const uint8x16_t s11, const uint8x16_t s12, + const uint8x16_t s13, const uint8x16_t s14, + const uint8x16_t s15) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); + s += p; + vst1q_u8(s, s8); + s += p; + vst1q_u8(s, s9); + s += p; + vst1q_u8(s, s10); + s += p; + vst1q_u8(s, s11); + s += p; + vst1q_u8(s, s12); + s += p; + vst1q_u8(s, s13); + s += p; + vst1q_u8(s, s14); + s += p; + vst1q_u8(s, s15); +} + +#define FUN_HOR_4_KERNEL(name, w) \ + static INLINE void lpf_horizontal_4##name##kernel( \ + uint8_t *s, const int p, const uint8x##w##_t blimit, \ + const uint8x##w##_t limit, const uint8x##w##_t thresh) { \ + uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \ + \ + load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \ + filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \ + q3, &hev, &mask); \ + filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \ + store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \ + } + +FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel +FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel +#undef FUN_HOR_4_KERNEL + +void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, 
&q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + store_4x8(s - 2, p, p1, p0, q0, q1); +} void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + s -= 2; + store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), + vget_low_u8(q1)); + store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), + vget_high_u8(q1)); +} + +void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, 
&q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: transpose + store_8x8() is faster than store_6x8(). + transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); +} + +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: store_6x8() twice is faster than transpose + store_8x16(). + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); +} + +#define FUN_LPF_16_KERNEL(name, w) \ + static INLINE void lpf_16##name##kernel( \ + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ + const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ + uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ + uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \ + uint32_t *flat_status, uint32_t *flat2_status) { \ + uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \ + \ + load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \ + &thresh_vec); \ + mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \ + p1, p0, q0, q1, q2, q3, &flat, \ + flat_status, &hev); \ + flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \ + flat2_status); \ + filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \ + p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \ + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ + oq6); \ + } + +FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel +FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel +#undef FUN_LPF_16_KERNEL + +void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, 
const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, + q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, + &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, + &flat2_status); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + p7 = vld1q_u8(s - 8 * p); + p6 = vld1q_u8(s - 7 * p); + p5 = vld1q_u8(s - 6 * p); + p4 = vld1q_u8(s - 5 * p); + q4 = vld1q_u8(s + 4 * p); + q5 = vld1q_u8(s + 5 * p); + q6 = vld1q_u8(s + 6 * p); + q7 = vld1q_u8(s + 7 * p); + lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, + q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + &flat_status, &flat2_status); + store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + uint32_t flat_status, flat2_status; + + s -= 8; + load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3, + &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, + q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, + &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, + &flat2_status); + if (flat_status) { + if (flat2_status) { + transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, + &s6, &s7); + store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + } else { + // Note: transpose + store_8x8() is faster than store_6x8(). 
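// s was rewound by 8 columns at the top of this function, so s + 4 is the p3
// column; the eight transposed columns written back here are p3..q3, covering
// every pixel filter8_8() may have changed (p3 and q3 are stored unmodified).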
+ transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); + } + } else { + store_4x8(s + 6, p, op1, op0, oq0, oq1); + } +} + +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status, flat2_status; + + s -= 8; + load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14, &s15); + transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, + &q2, &q3, &q4, &q5, &q6, &q7); + lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, + q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + &flat_status, &flat2_status); + if (flat_status) { + if (flat2_status) { + transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, + &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14, + &s15); + store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, s15); + } else { + // Note: store_6x8() twice is faster than transpose + store_8x16(). + s += 8; + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); + } + } else { + s += 6; + store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), + vget_low_u8(oq1)); + store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0), + vget_high_u8(oq0), vget_high_u8(oq1)); + } } diff --git a/libs/libvpx/vpx_dsp/arm/mem_neon.h b/libs/libvpx/vpx_dsp/arm/mem_neon.h new file mode 100644 index 0000000000..4efad5333e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/mem_neon.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_DSP_ARM_MEM_NEON_H_ + +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Helper functions used to load tran_low_t into int16, narrowing if necessary. 
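tran_low_t is a 32-bit type in high-bit-depth builds and 16-bit otherwise, while the NEON code in this library works on int16 lanes, so these loaders narrow on load and the store widens again. A scalar sketch of the narrowing direction, with illustrative names and assuming the coefficient values already fit in 16 bits:

#include <stdint.h>

/* Illustrative stand-in for the library's tran_low_t, which is int32_t in
 * high-bit-depth builds and int16_t otherwise. */
typedef int32_t tran_low_sketch_t;

/* Scalar model of load_tran_low_to_s16q(): copy 8 coefficients, narrowing to
 * 16 bits on the assumption that the values already fit. */
static void load_tran_low_sketch(int16_t dst[8], const tran_low_sketch_t src[8]) {
  int i;
  for (i = 0; i < 8; ++i) dst[i] = (int16_t)src[i];
}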
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + +// Propagate type information to the compiler. Without this the compiler may +// assume the required alignment of uint32_t (4 bytes) and add alignment hints +// to the memory access. +// +// This is used for functions operating on uint8_t which wish to load or store 4 +// values at a time but which may not be on 4 byte boundaries. +static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { + memcpy(buf, &a, 4); +} + +// Load 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { + uint32_t a; + uint32x2_t a_u32 = vdup_n_u32(0); + if (stride == 4) return vld1_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1_lane_u32(&a, a_u32, 0); + memcpy(&a, buf, 4); + a_u32 = vld1_lane_u32(&a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Store 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8(uint8_t *buf, int stride, + const uint8x8_t a) { + const uint32x2_t a_u32 = vreinterpret_u32_u8(a); + if (stride == 4) { + vst1_u8(buf, a); + return; + } + uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +// Store 4 sets of 4 bytes when alignment is not guaranteed. 
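The unaligned helpers above and below go through memcpy() (or uint32_to_mem()) rather than casting to uint32_t *, so the compiler cannot attach 4-byte alignment assumptions to the accesses. A minimal usage sketch for copying a 4-wide, 2-row block between arbitrarily strided buffers; the function and its parameters are illustrative only:

#include <arm_neon.h>
#include "vpx_dsp/arm/mem_neon.h"

/* Copy two 4-byte rows between buffers whose rows need not be 4-byte aligned. */
static void copy_4x2_sketch(uint8_t *dst, int dst_stride,
                            const uint8_t *src, int src_stride) {
  const uint8x8_t rows = load_unaligned_u8(src, src_stride);
  store_unaligned_u8(dst, dst_stride, rows);
}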
+static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, + const uint8x16_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); + if (stride == 4) { + vst1q_u8(buf, a); + return; + } + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3)); +} + +// Load 2 sets of 4 bytes when alignment is guaranteed. +static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { + uint32x2_t a = vdup_n_u32(0); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + a = vld1_lane_u32((const uint32_t *)buf, a, 0); + buf += stride; + a = vld1_lane_u32((const uint32_t *)buf, a, 1); + return vreinterpret_u8_u32(a); +} + +// Store 2 sets of 4 bytes when alignment is guaranteed. +static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { + uint32x2_t a_u32 = vreinterpret_u32_u8(a); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + vst1_lane_u32((uint32_t *)buf, a_u32, 0); + buf += stride; + vst1_lane_u32((uint32_t *)buf, a_u32, 1); +} +#endif // VPX_DSP_ARM_MEM_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/quantize_neon.c b/libs/libvpx/vpx_dsp/arm/quantize_neon.c new file mode 100644 index 0000000000..a0a1e6dd5a --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/quantize_neon.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vld1q_s16(zbin_ptr); + const int16x8_t round = vld1q_s16(round_ptr); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. 
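// Each lane of eob_max tracks the highest 1-based scan position (iscan + 1)
// seen so far for a non-zero quantized coefficient; the pairwise-max reduction
// at the end of the function collapses the lanes into the final *eob_ptr.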
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); + const int16x8_t round = vdupq_n_s16(round_ptr[1]); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + do { + // Add one because the eob is not its index. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + + n_coeffs -= 8; + } while (n_coeffs > 0); + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +// Main difference is that zbin values are halved before comparison and dqcoeff +// values are divided by 2. zbin is rounded but dqcoeff is not. 
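The halving of dqcoeff mentioned above must match the C reference, which divides by 2 with truncation toward zero; a plain arithmetic right shift would round toward negative infinity, hence the extract_sign_bit() adjustment inside the function below. A scalar sketch of that step, using an illustrative helper and assuming the compiler implements >> on negative values as an arithmetic shift, as the NEON vshrn does:

#include <assert.h>
#include <stdint.h>

/* dqcoeff = (qcoeff * dequant) / 2, computed the way the NEON path does it:
 * add 1 to negative products so the shift truncates toward zero. */
static int16_t dequant_32x32_sketch(int16_t qcoeff, int16_t dequant) {
  const int32_t prod = (int32_t)qcoeff * dequant;
  const int32_t shifted = (prod + (int32_t)((uint32_t)prod >> 31)) >> 1;
  assert(shifted == prod / 2);  /* matches the C reference's division */
  return (int16_t)shifted;
}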
+void vpx_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + (void)scan_ptr; + (void)n_coeffs; // Because we will always calculate 32*32. + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + + { + const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); + const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + for (i = 1; i < 32 * 32 / 8; ++i) { + // Add one because the eob is not its index. 
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c index dc20398000..b04de3aff2 100644 --- a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -13,212 +13,230 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, -// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo -// and vec_sum_ref_hi. 
-static void sad_neon_64(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); -} - -// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, -// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. -static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); -} - -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { +void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } +} - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); +void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + 
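// The 4x8 source block is read as two 16-byte vectors (rows 0-3 and rows 4-7);
// each candidate reference is read the same way and its absolute differences
// are accumulated into a single uint16x8_t before the horizontal add.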
const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); + const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); + const uint8x16_t ref_1 = + load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); + abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); + abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); + abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } +} - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); +static INLINE void sad8x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x8_t b_u8 = vld1_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], a_u8, b_u8); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } } -void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; +void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 4); +} - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); +void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 8); +} - sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, - &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, - &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, - &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, - 
&vec_sum_ref3_hi); +void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 16); +} - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; +static INLINE void sad16x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 8); } void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + sad16x_4d(src, src_stride, ref, ref_stride, res, 16); +} - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); +void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 32); +} - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); +static INLINE void sad32x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { 
+ int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_0 = vld1q_u8(b_loop[j]); + const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 64); +} + +static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, + const uint8x16_t b_0, const uint8x16_t b_1, + uint16x8_t *sum) { + *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); + *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); + *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); + *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); +} + +static INLINE void sad64x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i; + uint16x8_t sum_0 = vdupq_n_u16(0); + uint16x8_t sum_1 = vdupq_n_u16(0); + uint16x8_t sum_2 = vdupq_n_u16(0); + uint16x8_t sum_3 = vdupq_n_u16(0); + uint16x8_t sum_4 = vdupq_n_u16(0); + uint16x8_t sum_5 = vdupq_n_u16(0); + uint16x8_t sum_6 = vdupq_n_u16(0); + uint16x8_t sum_7 = vdupq_n_u16(0); + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + a += a_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); + sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), + &sum_1); + b_loop[0] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); + sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), + &sum_3); + b_loop[1] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); + sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), + &sum_5); + b_loop[2] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); + sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 
32), vld1q_u8(b_loop[3] + 48), + &sum_7); + b_loop[3] += b_stride; + } + + result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); + result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); + result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); + result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); +} + +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad64x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad64x_4d(src, src_stride, ref, ref_stride, res, 64); } diff --git a/libs/libvpx/vpx_dsp/arm/sad_neon.c b/libs/libvpx/vpx_dsp/arm/sad_neon.c index ff3228768c..9518a166bb 100644 --- a/libs/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libs/libvpx/vpx_dsp/arm/sad_neon.c @@ -13,211 +13,332 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); } - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } -unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - 
uint32x2_t d1; - uint64x1_t d3; +uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + second_pred += 16; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); - - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } -unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, a_u8, b_u8); + } + return abs; +} - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); +static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + const uint8x8_t c_u8 = vld1_u8(c); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 8; + abs = vabal_u8(abs, a_u8, avg); + } + return abs; +} + +#define sad8xN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); +sad8xN(4); +sad8xN(8); +sad8xN(16); - return vget_lane_u32(d5, 0); -} - -static INLINE unsigned int 
horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); + return abs; } -unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); + uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), 
- vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 16; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + return abs; } -unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad16xN(8); +sad16xN(16); +sad16xN(32); + +static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); + uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + return abs; } -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum = vdupq_n_u16(0); + uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + const uint8x16_t c_lo = vld1q_u8(c); + const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + a += a_stride; + b += b_stride; + c += 32; + abs = vabal_u8(abs, 
vget_low_u8(a_lo), vget_low_u8(avg_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); } - return horizontal_add_16x8(vec_accum); + return abs; } + +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad32xN(16); +sad32xN(32); +sad32xN(64); + +static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { + int i; + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + a += a_stride; + b += b_stride; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); + } +} + +static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { + int i; + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + const uint8x16_t c_0 = vld1q_u8(c); + const uint8x16_t c_1 = vld1q_u8(c + 16); + const uint8x16_t c_2 = vld1q_u8(c + 32); + const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + a += a_stride; + b += b_stride; + c += 64; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); + abs_1 = 
vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); + } +} + +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } + +sad64xN(32); +sad64xN(64); diff --git a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c index f044e11a15..4f58a7832a 100644 --- a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -12,25 +12,48 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/variance.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bilinear_filters[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +// Process a block exactly 4 wide and a multiple of 2 high. +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + const uint8_t *filter) { + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; i += 2) { + const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); + const uint8x8_t src_1 = + load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(output_ptr, out); + src_ptr += 2 * src_pixels_per_line; + output_ptr += 8; + } +} + +// Process a block exactly 8 wide and any height. static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -38,13 +61,13 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... + vst1_u8(output_ptr, out); src_ptr += src_pixels_per_line; - output_ptr += output_width; + output_ptr += 8; } } +// Process a block which is a mutiple of 16 wide and any height. 
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -52,8 +75,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -65,69 +88,97 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); } - // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } } -unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); - DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. +#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ + } - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters[yoffset]); - return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); -} +sub_pixel_varianceNxM(4, 4); +sub_pixel_varianceNxM(4, 8); +sub_pixel_varianceNxM(8, 4); +sub_pixel_varianceNxM(8, 8); +sub_pixel_varianceNxM(8, 16); +sub_pixel_varianceNxM(16, 8); +sub_pixel_varianceNxM(16, 16); +sub_pixel_varianceNxM(16, 32); +sub_pixel_varianceNxM(32, 16); +sub_pixel_varianceNxM(32, 32); +sub_pixel_varianceNxM(32, 64); +sub_pixel_varianceNxM(64, 32); +sub_pixel_varianceNxM(64, 64); -unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); - DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. 
+#define sub_pixel_avg_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \ + } - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters[yoffset]); - return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters[yoffset]); - return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters[yoffset]); - return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); -} +sub_pixel_avg_varianceNxM(4, 4); +sub_pixel_avg_varianceNxM(4, 8); +sub_pixel_avg_varianceNxM(8, 4); +sub_pixel_avg_varianceNxM(8, 8); +sub_pixel_avg_varianceNxM(8, 16); +sub_pixel_avg_varianceNxM(16, 8); +sub_pixel_avg_varianceNxM(16, 16); +sub_pixel_avg_varianceNxM(16, 32); +sub_pixel_avg_varianceNxM(32, 16); +sub_pixel_avg_varianceNxM(32, 32); +sub_pixel_avg_varianceNxM(32, 64); +sub_pixel_avg_varianceNxM(64, 32); +sub_pixel_avg_varianceNxM(64, 64); diff --git a/libs/libvpx/vpx_dsp/arm/sum_neon.h b/libs/libvpx/vpx_dsp/arm/sum_neon.h new file mode 100644 index 0000000000..d74fe0cde4 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) {
+  const int32x4_t b = vpaddlq_s16(a);
+  const int64x2_t c = vpaddlq_s32(b);
+  return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                  vreinterpret_s32_s64(vget_high_s64(c)));
+}
+
+static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                  vreinterpret_u32_u64(vget_high_u64(c)));
+}
+
+static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a,
+                                                      const uint16x8_t b) {
+  const uint32x4_t c = vpaddlq_u16(a);
+  const uint32x4_t d = vpadalq_u16(c, b);
+  const uint64x2_t e = vpaddlq_u32(d);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)),
+                  vreinterpret_u32_u64(vget_high_u64(e)));
+}
+
+static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+  const uint64x2_t b = vpaddlq_u32(a);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                  vreinterpret_u32_u64(vget_high_u64(b)));
+}
+#endif  // VPX_DSP_ARM_SUM_NEON_H_
diff --git a/libs/libvpx/vpx_dsp/arm/transpose_neon.h b/libs/libvpx/vpx_dsp/arm/transpose_neon.h
index 3d0b41f930..d85cbcee46 100644
--- a/libs/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libs/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -21,7 +21,7 @@
 //
 // b0.val[0]: 00 01 02 03 16 17 18 19
 // b0.val[1]: 04 05 06 07 20 21 22 23
-static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -30,7 +30,23 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
   return b0;
 }
 
-static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t b0;
+  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+  return b0;
+}
+
+static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+  int64x2x2_t b0;
+  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
+                           vreinterpret_s64_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
+                           vreinterpret_s64_s32(vget_high_s32(a1)));
+  return b0;
+}
+
+static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
   uint8x16x2_t b0;
   b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                           vreinterpret_u8_u32(vget_low_u32(a1)));
@@ -39,6 +55,463 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
   return b0;
 }
 
+static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+  uint16x8x2_t b0;
+  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+                           vreinterpret_u16_u32(vget_low_u32(a1)));
+  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+                           vreinterpret_u16_u32(vget_high_u32(a1)));
+  return b0;
+}
+
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+  // Swap 16 bit elements.
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int16x4x2_t b0 = vtrn_s16(*a0, *a1); + const int16x4x2_t b1 = vtrn_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + *a0 = vreinterpret_s16_s32(c0.val[0]); + *a1 = vreinterpret_s16_s32(c1.val[0]); + *a2 = vreinterpret_s16_s32(c0.val[1]); + *a3 = vreinterpret_s16_s32(c1.val[1]); +} + +static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const int32x4_t c0 = + vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); + const int32x4_t c1 = + vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const int16x8x2_t d0 = + vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint32x4x2_t b0 = + vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x4_t c0 = + vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); + const uint32x4_t c1 = + vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint16x8x2_t d0 = + vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + +static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, + const int16x4_t a2, const int16x4_t a3, + const int16x4_t a4, const int16x4_t a5, + const int16x4_t a6, const int16x4_t a7, + int16x8_t *const o0, int16x8_t *const o1, + int16x8_t *const o2, int16x8_t *const o3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int16x4x2_t b0 = vtrn_s16(a0, a1); + const int16x4x2_t b1 = vtrn_s16(a2, a3); + const int16x4x2_t b2 = vtrn_s16(a4, a5); + const int16x4x2_t b3 = vtrn_s16(a6, a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + +static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a5); + const int32x4x2_t b3 = vtrnq_s32(*a6, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c2.val[0]); + *a2 = vreinterpretq_s32_s64(c1.val[0]); + *a3 = vreinterpretq_s32_s64(c3.val[0]); + *a4 = vreinterpretq_s32_s64(c0.val[1]); + *a5 = vreinterpretq_s32_s64(c2.val[1]); + *a6 = vreinterpretq_s32_s64(c1.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + +static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + + *a0 = vreinterpretq_u16_u32(c0.val[0]); + *a1 = vreinterpretq_u16_u32(c1.val[0]); + *a2 = vreinterpretq_u16_u32(c0.val[1]); + *a3 = vreinterpretq_u16_u32(c1.val[1]); +} + +static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 04 05 06 07 + // a2: 10 11 12 13 + // a3: 14 15 16 17 + // a4: 20 21 22 23 + // a5: 24 25 26 27 + // a6: 30 31 32 33 + // a7: 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 04 14 06 16 + // b1.val[1]: 05 15 07 17 + // b2.val[0]: 20 30 22 32 + // b2.val[1]: 21 31 23 33 + // b3.val[0]: 24 34 26 36 + // b3.val[1]: 25 35 27 37 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a2); + const int32x4x2_t b1 = vtrnq_s32(*a1, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a6); + const int32x4x2_t b3 = vtrnq_s32(*a5, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 04 14 24 34 + // c2.val[1]: 06 16 26 36 + // c3.val[0]: 05 15 25 35 + // c3.val[1]: 07 17 27 37 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c1.val[0]); + *a2 = vreinterpretq_s32_s64(c0.val[1]); + *a3 = vreinterpretq_s32_s64(c1.val[1]); + *a4 = vreinterpretq_s32_s64(c2.val[0]); + *a5 = vreinterpretq_s32_s64(c3.val[0]); + *a6 = vreinterpretq_s32_s64(c2.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + // Note: Using 'd' registers or 'q' registers has almost identical speed. We use // 'q' registers here to save some instructions. 
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, @@ -151,10 +624,10 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 - const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]); - const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]); - const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]); - const int16x8x2_t d3 = vpx_vtrnq_s64(c1.val[1], c3.val[1]); + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *a0 = d0.val[0]; *a1 = d1.val[0]; @@ -166,6 +639,154 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, *a7 = d3.val[1]; } +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); + const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); + const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); + const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +static INLINE 
void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, + int32x4x2_t *a2, int32x4x2_t *a3, + int32x4x2_t *a4, int32x4x2_t *a5, + int32x4x2_t *a6, int32x4x2_t *a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0: 00 10 02 12 01 11 03 13 + // b1: 20 30 22 32 21 31 23 33 + // b2: 40 50 42 52 41 51 43 53 + // b3: 60 70 62 72 61 71 63 73 + // b4: 04 14 06 16 05 15 07 17 + // b5: 24 34 26 36 25 35 27 37 + // b6: 44 54 46 56 45 55 47 57 + // b7: 64 74 66 76 65 75 67 77 + + const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]); + const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]); + const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]); + const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]); + const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]); + const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]); + const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]); + const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]); + + // Swap 64 bit elements resulting in: + // c0: 00 10 20 30 02 12 22 32 + // c1: 01 11 21 31 03 13 23 33 + // c2: 40 50 60 70 42 52 62 72 + // c3: 41 51 61 71 43 53 63 73 + // c4: 04 14 24 34 06 16 26 36 + // c5: 05 15 25 35 07 17 27 37 + // c6: 44 54 64 74 46 56 66 76 + // c7: 45 55 65 75 47 57 67 77 + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]); + const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]); + const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]); + const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]); + const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]); + const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]); + + // Swap 128 bit elements resulting in: + // a0: 00 10 20 30 40 50 60 70 + // a1: 01 11 21 31 41 51 61 71 + // a2: 02 12 22 32 42 52 62 72 + // a3: 03 13 23 33 43 53 63 73 + // a4: 04 14 24 34 44 54 64 74 + // a5: 05 15 25 35 45 55 65 75 + // a6: 06 16 26 36 46 56 66 76 + // a7: 07 17 27 37 47 57 67 77 + a0->val[0] = c0.val[0]; + a0->val[1] = c2.val[0]; + a1->val[0] = c1.val[0]; + a1->val[1] = c3.val[0]; + a2->val[0] = c0.val[1]; + a2->val[1] = c2.val[1]; + a3->val[0] = c1.val[1]; + a3->val[1] = c3.val[1]; + a4->val[0] = c4.val[0]; + a4->val[1] = c6.val[0]; + a5->val[0] = c5.val[0]; + a5->val[1] = c7.val[0]; + a6->val[0] = c4.val[1]; + a6->val[1] = c6.val[1]; + a7->val[0] = c5.val[1]; + a7->val[1] = c7.val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, @@ -519,14 +1140,14 @@ static INLINE void transpose_u8_16x16( // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF - const uint8x16x2_t e0 = vpx_vtrnq_u64(d0.val[0], d4.val[0]); - const uint8x16x2_t e1 = vpx_vtrnq_u64(d2.val[0], d6.val[0]); - const uint8x16x2_t e2 = vpx_vtrnq_u64(d1.val[0], d5.val[0]); - const uint8x16x2_t e3 = vpx_vtrnq_u64(d3.val[0], d7.val[0]); - const uint8x16x2_t e4 = vpx_vtrnq_u64(d0.val[1], d4.val[1]); - const 
uint8x16x2_t e5 = vpx_vtrnq_u64(d2.val[1], d6.val[1]); - const uint8x16x2_t e6 = vpx_vtrnq_u64(d1.val[1], d5.val[1]); - const uint8x16x2_t e7 = vpx_vtrnq_u64(d3.val[1], d7.val[1]); + const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]); + const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]); + const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]); + const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]); + const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]); + const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]); + const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]); + const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]); // Output: // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 @@ -563,4 +1184,133 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + uint8x8_t a4, a5, a6, a7; + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + a4 = vld1_u8(a); + a += a_stride; + a5 = vld1_u8(a); + a += a_stride; + a6 = vld1_u8(a); + a += a_stride; + a7 = vld1_u8(a); + + transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_u8_8x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, + uint8x8_t *a5, uint8x8_t *a6, + uint8x8_t *a7) { + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + *a4 = vld1_u8(a); + a += a_stride; + *a5 = vld1_u8(a); + a += a_stride; + *a6 = vld1_u8(a); + a += a_stride; + *a7 = vld1_u8(a); + + transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride, + uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, uint8x8_t a7) { + transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1_u8(a, a0); + a += a_stride; + vst1_u8(a, a1); + a += a_stride; + vst1_u8(a, a2); + a += a_stride; + vst1_u8(a, a3); + a += a_stride; + vst1_u8(a, a4); + a += a_stride; + vst1_u8(a, a5); + a += a_stride; + vst1_u8(a, a6); + a += a_stride; + vst1_u8(a, a7); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, + const int a_stride, int16x8_t *a0, + int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, + int16x8_t *a5, int16x8_t *a6, + int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_s32_8x8( + const int32_t *a, const int a_stride, int32x4x2_t *const a0, + int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3, + int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6, + int32x4x2_t *const a7) { + a0->val[0] = vld1q_s32(a); + a0->val[1] = vld1q_s32(a + 4); + a += a_stride; + a1->val[0] = vld1q_s32(a); + a1->val[1] = vld1q_s32(a + 4); + a += a_stride; + a2->val[0] = vld1q_s32(a); + a2->val[1] = vld1q_s32(a + 4); + 
a += a_stride; + a3->val[0] = vld1q_s32(a); + a3->val[1] = vld1q_s32(a + 4); + a += a_stride; + a4->val[0] = vld1q_s32(a); + a4->val[1] = vld1q_s32(a + 4); + a += a_stride; + a5->val[0] = vld1q_s32(a); + a5->val[1] = vld1q_s32(a + 4); + a += a_stride; + a6->val[0] = vld1q_s32(a); + a6->val[1] = vld1q_s32(a + 4); + a += a_stride; + a7->val[0] = vld1q_s32(a); + a7->val[1] = vld1q_s32(a + 4); + + transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} #endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/variance_neon.c b/libs/libvpx/vpx_dsp/arm/variance_neon.c index b6d7f86a4b..61c2c16a72 100644 --- a/libs/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/variance_neon.c @@ -9,99 +9,204 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); +// The variance helper functions use int16_t for sum. 8 values are accumulated +// and then added (at which point they expand up to int32_t). To avoid overflow, +// there can be no more than 32767 / 255 ~= 128 values accumulated in each +// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 +// rows = 128. Asserts have been added to each function to warn against reaching +// this limit. + +// Process a block of width 4 four rows at a time. +static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int h, uint32_t *sse, int *sum) { + int i; + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + // Since width is only 4, sum_s16 only loads a half row per loop. + assert(h <= 256); + + for (i = 0; i < h; i += 4) { + const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride); + const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride); + const uint16x8_t diff_lo_u16 = + vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); + const uint16x8_t diff_hi_u16 = + vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); + + const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); + const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); + + sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); + sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), + vget_low_s16(diff_lo_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), + vget_high_s16(diff_lo_s16)); + + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), + vget_low_s16(diff_hi_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), + vget_high_s16(diff_hi_s16)); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -// w * h must be less than 2048 or local variable v_sum may overflow. 
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +// Process a block of any size where the width is divisible by 16. +static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + // The loop loads 16 values at a time but doubles them up when accumulating + // into sum_s16. + assert(w / 8 * h <= 128); for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = - vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); - v_sse_hi = - vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + for (j = 0; j < w; j += 16) { + const uint8x16_t a_u8 = vld1q_u8(a + j); + const uint8x16_t b_u8 = vld1q_u8(b + j); + + const uint16x8_t diff_lo_u16 = + vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); + const uint16x8_t diff_hi_u16 = + vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); + + const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); + const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); + + sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); + sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), + vget_low_s16(diff_lo_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), + vget_high_s16(diff_lo_s16)); + + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), + vget_low_s16(diff_hi_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), + vget_high_s16(diff_hi_s16)); } a += a_stride; b += b_stride; } - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); +} + +// Process a block of width 8 two rows at a time. +static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int h, uint32_t *sse, int *sum) { + int i = 0; + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + // Each column has it's own accumulator entry in sum_s16. 
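+  // Each row adds one |a - b| difference (at most 255) to every lane, so the
+  // 32767 / 255 ~= 128 bound described at the top of this file caps h at 128
+  // rows for this width-8 kernel.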
+ assert(h <= 128); + + do { + const uint8x8_t a_0_u8 = vld1_u8(a); + const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); + const uint8x8_t b_0_u8 = vld1_u8(b); + const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); + const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); + const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); + const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); + sum_s16 = vaddq_s16(sum_s16, diff_0_s16); + sum_s16 = vaddq_s16(sum_s16, diff_1_s16); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), + vget_low_s16(diff_0_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), + vget_low_s16(diff_1_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), + vget_high_s16(diff_0_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), + vget_high_s16(diff_1_s16)); + a += a_stride + a_stride; + b += b_stride + b_stride; + i += 2; + } while (i < h); + + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); + variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); + variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); } -unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - ((sum * sum) >> 6); -} +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + if (n == 4) \ + variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \ + else if (n == 8) \ + variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ + else \ + variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } -unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); -} - -unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} +varianceNxM(4, 4, 4); +varianceNxM(4, 8, 5); +varianceNxM(8, 4, 5); +varianceNxM(8, 8, 6); +varianceNxM(8, 16, 7); +varianceNxM(16, 8, 7); +varianceNxM(16, 16, 8); +varianceNxM(16, 32, 9); +varianceNxM(32, 16, 9); +varianceNxM(32, 32, 10); unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), 
b_stride, - 32, 32, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), + b_stride, 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -112,9 +217,9 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -126,162 +231,24 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - 
q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c deleted file mode 100644 index 69cb284005..0000000000 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - src += 7; - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; width > 0; width -= 4, src += 4, dst += 4) { 
// loop_horiz - s = src; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(src + 64); - - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(src + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(src + 64 + src_stride * 2); - - d = dst; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - __builtin_prefetch(src + 64 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - d = dst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = 
vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - src += src_stride * 4 - w - 7; - dst += dst_stride * 4 - w; - } - return; -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - uint8x16_t q1u8, q3u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - d -= dst_stride * 3; - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(d); - __builtin_prefetch(d + 
dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - q1u8 = vcombine_u8(d2u8, d3u8); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm index e279d570fc..1c2ee50630 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz sub r2, r2, r3, lsl #2 ; reset for store - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert pld [r7] pld [r4] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r7, r1] diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 514525696b..08ae17dbab 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -14,309 +14,946 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" -static INLINE int32x4_t 
MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; +// Note: +// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src). +// 2. After refactoring the shared code in kernel loops with inline functions, +// the decoder speed dropped a lot when using gcc compiler. Therefore there is +// no refactoring for those parts by now. +// 3. For horizontal convolve, there is an alternative optimization that +// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8 +// samples in each are read from memory: src, (src+1), (src+2), (src+3), +// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract +// instructions. This optimization is much faster in speed unit test, but slowed +// down the whole decoder by 5%. - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); } void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s, *psrc; - uint8_t *d, *pdst; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + uint8x8_t t0, t1, t2, t3; + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); assert(x_step_q4 == 16); - q0s16 = vld1q_s16(filter_x); + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - src -= 3; // adjust for taps - for (; h > 0; h -= 4, src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); + src -= 3; - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t filter3, filter4, s0, s1, s2, s3, s4, 
s5, s6, s7, s8, s9, s10, d0, + d1, d2, d3; + int16x8_t tt0, tt1, tt2, tt3; - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - __builtin_prefetch(src + src_stride * 6); + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + transpose_u8_4x4(&d01, &d23); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; width > 0; - width -= 4, psrc += 4, pdst += 4) { // loop_horiz - s = psrc; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), + vreinterpret_u32_u8(d01), 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), + vreinterpret_u32_u8(d23), 0); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), + vreinterpret_u32_u8(d01), 1); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), + 
vreinterpret_u32_u8(d23), 1); - __builtin_prefetch(psrc + 64); + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int width; + const uint8_t *s; + uint8x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + if (w == 4) { + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - __builtin_prefetch(psrc + 64 + src_stride); + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); + dst += dst_stride; + 
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + uint8_t *d; + int16x8_t s11, s12, s13, s14; - __builtin_prefetch(psrc + 64 + src_stride * 2); + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - __builtin_prefetch(psrc + 60 + src_stride * 3); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + filter4); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + filter4); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, 
s13, filters, filter3, + filter4); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, + filter3, filter4); - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); - d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); - - d = pdst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + uint8x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, + d1, d2, d3; + int16x8_t tt0, tt1, tt2, tt3; + uint32x4_t d0123 = vdupq_n_u32(0); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + 
d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + transpose_u8_4x4(&d01, &d23); + + d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + d0123 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + + vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); + vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); + vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); + vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int width; + const uint8_t *s; + uint8x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + if (w == 4) { + uint32x4_t d0415 = vdupq_n_u32(0); + uint32x4_t d2637 = vdupq_n_u32(0); + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); + d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); + d2637 = vld1q_lane_u32((uint32_t 
*)(dst + 2 * dst_stride), d2637, 0); + d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); + d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); + d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); + d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); + d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); + d0415 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); + d2637 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); + + vst1q_lane_u32((uint32_t *)dst, d0415, 0); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0415, 2); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 0); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 2); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0415, 1); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0415, 3); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 1); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 3); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + uint8_t *d; + int16x8_t s11, s12, s13, s14; + uint8x16_t d01, d23, d45, d67; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + filter4); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + filter4); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, + filter4); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, 
s13, s14, filters, + filter3, filter4); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), + vld1_u8(d + 1 * dst_stride)); + d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), + vld1_u8(d + 3 * dst_stride)); + d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), + vld1_u8(d + 5 * dst_stride)); + d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), + vld1_u8(d + 7 * dst_stride)); + d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); + d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); + d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); + d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); + + store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), + vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), + vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); } } - return; } void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint32x2_t d2u32, d3u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); assert(y_step_q4 == 16); - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + src -= 3 * src_stride; - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + if (w == 4) { + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s1 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; + s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + d = dst; + height = h; - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + do { + s7 = 
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); - d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const int16x4_t 
filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + uint32x4_t d0123 = vdupq_n_u32(0); + + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + + d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + d0123 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + + vst1q_lane_u32((uint32_t *)dst, d0123, 0); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0123, 1); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0123, 2); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0123, 3); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3; + uint8x16_t d01, d23, dd01, dd23; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += 
src_stride; + s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vcombine_u8(t0, t1); + d23 = vcombine_u8(t2, t3); + dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), + vld1_u8(d + 1 * dst_stride)); + dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), + vld1_u8(d + 3 * dst_stride)); + dd01 = vrhaddq_u8(dd01, d01); + dd23 = vrhaddq_u8(dd23, d23); + + vst1_u8(d, vget_low_u8(dd01)); + d += dst_stride; + vst1_u8(d, vget_high_u8(dd01)); + d += dst_stride; + vst1_u8(d, vget_low_u8(dd23)); + d += dst_stride; + vst1_u8(d, vget_high_u8(dd23)); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h new file mode 100644 index 0000000000..c1634ed55f --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, + const int16x4_t filter3, + const int16x4_t filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filters_lo, 0); + sum = vmla_lane_s16(sum, s1, filters_lo, 1); + sum = vmla_lane_s16(sum, s2, filters_lo, 2); + sum = vmla_lane_s16(sum, s5, filters_hi, 1); + sum = vmla_lane_s16(sum, s6, filters_hi, 2); + sum = vmla_lane_s16(sum, s7, filters_hi, 3); + sum = vqadd_s16(sum, vmul_s16(s3, filter3)); + sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, + const int16x8_t filter3, + const int16x8_t filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filters_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); + sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); + sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filters) { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int16x8_t ss[8]; + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4])); + ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5])); + ss[6] = 
vreinterpretq_s16_u16(vmovl_u8(s[6])); + ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], + filters, filter3, filter4); +} diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm index 2d0f2ae065..5eee15664d 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz pld [r5, r1, lsl #1] - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -216,7 +219,7 @@ vpx_convolve8_loop_vert pld [r5] pld [r8] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r5, r3] diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c index abc2511ea2..07349d03ae 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -13,132 +13,127 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_convolve_avg_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - uint8_t *d; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint32x2_t d0u32, d2u32; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - d = dst; - if (w > 32) { // avg64 - for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); + if (w < 8) { // avg4 + uint8x8_t s0, s1; + uint8x8_t dd0 = vdup_n_u8(0); + uint32x2x2_t s01; + do { + s0 = vld1_u8(src); src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - q10u8 = vld1q_u8(d + 32); - q11u8 = vld1q_u8(d + 48); - d += 
dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // avg32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); + s1 = vld1_u8(src); src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - d += dst_stride; - q10u8 = vld1q_u8(d); - q11u8 = vld1q_u8(d + 16); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + s01 = vzip_u32(vreinterpret_u32_u8(s0), vreinterpret_u32_u8(s1)); + dd0 = vreinterpret_u8_u32( + vld1_lane_u32((const uint32_t *)dst, vreinterpret_u32_u8(dd0), 0)); + dd0 = vreinterpret_u8_u32(vld1_lane_u32( + (const uint32_t *)(dst + dst_stride), vreinterpret_u32_u8(dd0), 1)); + dd0 = vrhadd_u8(vreinterpret_u8_u32(s01.val[0]), dd0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 0); dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1); dst += dst_stride; - } - } else if (w > 8) { // avg16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(d); - d += dst_stride; - q3u8 = vld1q_u8(d); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q1u8 = vrhaddq_u8(q1u8, q3u8); - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } + h -= 2; + } while (h > 0); } else if (w == 8) { // avg8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); + uint8x8_t s0, s1, d0, d1; + uint8x16_t s01, d01; + do { + s0 = vld1_u8(src); src += src_stride; - d1u8 = vld1_u8(src); + s1 = vld1_u8(src); src += src_stride; - d2u8 = vld1_u8(d); - d += dst_stride; - d3u8 = vld1_u8(d); - d += dst_stride; + d0 = vld1_u8(dst); + d1 = vld1_u8(dst + dst_stride); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); + s01 = vcombine_u8(s0, s1); + d01 = vcombine_u8(d0, d1); + d01 = vrhaddq_u8(s01, d01); - vst1_u8(dst, vget_low_u8(q0u8)); + vst1_u8(dst, vget_low_u8(d01)); dst += dst_stride; - vst1_u8(dst, vget_high_u8(q0u8)); + vst1_u8(dst, vget_high_u8(d01)); dst += dst_stride; - } - } else { // avg4 - for (; h > 0; h -= 2) { - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint8x16_t s0, s1, d0, d1; + do { + s0 = vld1q_u8(src); src += src_stride; - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); + s1 = vld1q_u8(src); src += src_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); - d += dst_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); - d += dst_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + dst_stride); - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); - d0u32 = vreinterpret_u32_u8(d0u8); - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + vst1q_u8(dst, d0); dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, d0u32, 1); + vst1q_u8(dst, d1); dst += dst_stride; - } + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; + do 
{ + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + src += src_stride; + s2 = vld1q_u8(src); + s3 = vld1q_u8(src + 16); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + 16); + d2 = vld1q_u8(dst + dst_stride); + d3 = vld1q_u8(dst + dst_stride + 16); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + d2 = vrhaddq_u8(s2, d2); + d3 = vrhaddq_u8(s3, d3); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + dst += dst_stride; + vst1q_u8(dst, d2); + vst1q_u8(dst + 16, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + s2 = vld1q_u8(src + 32); + s3 = vld1q_u8(src + 48); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + 16); + d2 = vld1q_u8(dst + 32); + d3 = vld1q_u8(dst + 48); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + d2 = vrhaddq_u8(s2, d2); + d3 = vrhaddq_u8(s3, d3); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + vst1q_u8(dst + 32, d2); + vst1q_u8(dst + 48, d3); + dst += dst_stride; + } while (--h); } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 97e6189fda..efd6574f1f 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_avg_neon| PROC push {r4-r6, lr} - ldrd r4, r5, [sp, #32] + ldrd r4, r5, [sp, #36] mov r6, r2 cmp r4, #32 diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c index fec189e0e4..7abed67a40 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -13,80 +13,87 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_convolve_copy_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - uint8x8_t d0u8, d2u8; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - if (w > 32) { // copy64 - for (; h > 0; h--) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // copy32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // copy16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // copy8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; 
- d2u8 = vld1_u8(src); - src += src_stride; - - vst1_u8(dst, d0u8); - dst += dst_stride; - vst1_u8(dst, d2u8); - dst += dst_stride; - } - } else { // copy4 - for (; h > 0; h--) { + if (w < 8) { // copy4 + do { *(uint32_t *)dst = *(const uint32_t *)src; src += src_stride; dst += dst_stride; - } + *(uint32_t *)dst = *(const uint32_t *)src; + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // copy8 + uint8x8_t s0, s1; + do { + s0 = vld1_u8(src); + src += src_stride; + s1 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, s0); + dst += dst_stride; + vst1_u8(dst, s1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // copy16 + uint8x16_t s0, s1; + do { + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, s0); + dst += dst_stride; + vst1q_u8(dst, s1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // copy32 + uint8x16_t s0, s1, s2, s3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + src += src_stride; + s2 = vld1q_u8(src); + s3 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, s0); + vst1q_u8(dst + 16, s1); + dst += dst_stride; + vst1q_u8(dst, s2); + vst1q_u8(dst + 16, s3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // copy64 + uint8x16_t s0, s1, s2, s3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + s2 = vld1q_u8(src + 32); + s3 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, s0); + vst1q_u8(dst + 16, s1); + vst1q_u8(dst + 32, s2); + vst1q_u8(dst + 48, s3); + dst += dst_stride; + } while (--h); } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index 89164ad48b..7a66e3ce2f 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_copy_neon| PROC push {r4-r5, lr} - ldrd r4, r5, [sp, #28] + ldrd r4, r5, [sp, #32] cmp r4, #32 bgt copy64 diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index c2d5895b71..2bf2d890be 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -15,41 +15,40 @@ #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + uint8_t temp[64 * 72]; // Account for the vertical phase needing 3 lines prior and 4 lines post - int intermediate_height = h + 7; + const int intermediate_height = h + 7; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. The neon implementation will ignore the - * given height and filter a multiple of 4 lines. Since this goes in to - * the temp buffer which has lots of extra room and is subsequently discarded - * this is safe if somewhat less than ideal. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, + /* Filter starting 3 lines back. 
The neon implementation will ignore the given + * height and filter a multiple of 4 lines. Since this goes in to the temp + * buffer which has lots of extra room and is subsequently discarded this is + * safe if somewhat less than ideal. */ + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - int intermediate_height = h + 7; + uint8_t temp[64 * 72]; + const int intermediate_height = h + 7; assert(y_step_q4 == 16); assert(x_step_q4 == 16); @@ -57,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c new file mode 100644 index 0000000000..8edf8a66e6 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters, filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], + vreinterpret_u32_u8(d4.val[0]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], + vreinterpret_u32_u8(d4.val[1]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], + vreinterpret_u32_u8(d4.val[2]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], + vreinterpret_u32_u8(d4.val[3]), 0); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
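The forcing described in this comment is done with the usual power-of-two round-up on the line that follows: (h + 7) & ~7 rounds h up to the next multiple of 8, so for example an intermediate height of 61 becomes (61 + 7) & ~7 = 64, while a value that is already a multiple of 8, such as 64, is left unchanged.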
+ y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + vst1_u8(&dst[x + 0 * dst_stride], d[0]); + vst1_u8(&dst[x + 1 * dst_stride], d[1]); + vst1_u8(&dst[x + 2 * dst_stride], d[2]); + vst1_u8(&dst[x + 3 * dst_stride], d[3]); + vst1_u8(&dst[x + 4 * dst_stride], d[4]); + vst1_u8(&dst[x + 5 * dst_stride], d[5]); + vst1_u8(&dst[x + 6 * dst_stride], d[6]); + vst1_u8(&dst[x + 7 * dst_stride], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, + filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + 
&s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
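As a quick numeric check of the bound derived in the comment above: with libvpx's SUBPEL_BITS of 4 and SUBPEL_TAPS of 8, the worst case the comment describes works out to 134 rows, which fits inside the (135 + 8)-row buffer. A minimal sketch using the same formula as the code below (check_temp_rows is a hypothetical helper, not part of the patch):

```c
#include <assert.h>

#define SUBPEL_BITS 4 /* libvpx sub-pixel precision  */
#define SUBPEL_TAPS 8 /* 8-tap interpolation filters */

/* Worst case from the derivation above: 64 output rows, the smallest
 * normative scaling factor (y_step_q4 == 32) and the last sub-pixel
 * start phase (y0_q4 == 15). */
static void check_temp_rows(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15;
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  assert(intermediate_height <= 135 + 8); /* 134: fits the temp buffer */
}
```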
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/libs/libvpx/vpx_dsp/avg.c b/libs/libvpx/vpx_dsp/avg.c index 4d9abb8de3..a7ac6d9538 100644 --- a/libs/libvpx/vpx_dsp/avg.c +++ b/libs/libvpx/vpx_dsp/avg.c @@ -34,7 +34,7 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; @@ -66,10 +66,11 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride, // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. 
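The bit-depth annotations threaded through the hadamard hunks below (9-bit source differences, 12 bits after the first pass, 15 bits after the second) follow from the transform itself: an 8-point Hadamard is three add/subtract stages, so each 1-D pass can grow the worst-case magnitude by at most a factor of 8. A small arithmetic check (hadamard_range_check is illustrative only, not a libvpx function):

```c
#include <assert.h>

/* Residuals start in [-255, 255]; each 8-point (three-stage) Hadamard
 * pass multiplies the worst-case magnitude by 8. */
static void hadamard_range_check(void) {
  const int max_residual = 255;            /* 9 bit, signed */
  const int after_rows = 8 * max_residual; /* 2040,  12 bit */
  const int after_cols = 8 * after_rows;   /* 16320, 15 bit */
  assert(after_rows == 2040 && after_cols == 16320);
}
```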
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; int16_t buffer[64]; + int16_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit @@ -80,17 +81,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; } // In place 16x16 2D Hadamard transform -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] @@ -101,15 +104,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] coeff[64] = b1 + b3; @@ -122,7 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. 
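Those two ranges also explain why the plain int accumulator in vpx_satd_c below is safe: even in the largest case, 1024 coefficients of magnitude 32640 sum to 1024 * 32640 = 33,423,360, which is well within the range of a 32-bit int, so no wider accumulator is needed.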
-int vpx_satd_c(const int16_t *coeff, int length) { +int vpx_satd_c(const tran_low_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.c b/libs/libvpx/vpx_dsp/bitreader_buffer.c index e99fffb605..3e16bfa38c 100644 --- a/libs/libvpx/vpx_dsp/bitreader_buffer.c +++ b/libs/libvpx/vpx_dsp/bitreader_buffer.c @@ -40,11 +40,5 @@ int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { } int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { -#if CONFIG_MISC_FIXES - const int nbits = sizeof(unsigned) * 8 - bits - 1; - const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; - return ((int)value) >> nbits; -#else return vpx_rb_read_signed_literal(rb, bits); -#endif } diff --git a/libs/libvpx/vpx_dsp/bitwriter_buffer.c b/libs/libvpx/vpx_dsp/bitwriter_buffer.c index 1043cdc784..7a7e96f02e 100644 --- a/libs/libvpx/vpx_dsp/bitwriter_buffer.c +++ b/libs/libvpx/vpx_dsp/bitwriter_buffer.c @@ -38,10 +38,6 @@ void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { -#if CONFIG_MISC_FIXES - vpx_wb_write_literal(wb, data, bits + 1); -#else vpx_wb_write_literal(wb, abs(data), bits); vpx_wb_write_bit(wb, data < 0); -#endif } diff --git a/libs/libvpx/vpx_dsp/deblock.c b/libs/libvpx/vpx_dsp/deblock.c index 589b124e26..94acbb3919 100644 --- a/libs/libvpx/vpx_dsp/deblock.c +++ b/libs/libvpx/vpx_dsp/deblock.c @@ -7,7 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" const int16_t vpx_rv[] = { @@ -48,6 +50,9 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, unsigned char v; unsigned char d[4]; + assert(size >= 8); + assert(cols >= 8); + for (row = 0; row < size; row++) { /* post_proc_down for one row */ p_src = src_ptr; @@ -117,7 +122,7 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, unsigned char d[16]; for (r = 0; r < rows; r++) { - int sumsq = 0; + int sumsq = 16; int sum = 0; for (i = -8; i < 0; i++) s[i] = s[0]; @@ -156,14 +161,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; for (c = 0; c < cols; c++) { unsigned char *s = &dst[c]; int sumsq = 0; int sum = 0; unsigned char d[16]; - const int16_t *rv2 = rv3 + ((c * 17) & 127); for (i = -8; i < 0; i++) s[i * pitch] = s[0]; @@ -183,7 +186,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, d[r & 15] = s[0]; if (sumsq * 15 - sum * sum < flimit) { - d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; + d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4; } if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; s += pitch; diff --git a/libs/libvpx/vpx_dsp/fastssim.c b/libs/libvpx/vpx_dsp/fastssim.c index 4d5eb5a6ff..0469071a17 100644 --- a/libs/libvpx/vpx_dsp/fastssim.c +++ b/libs/libvpx/vpx_dsp/fastssim.c @@ -202,6 +202,7 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { if (bit_depth == 12) ssim_c1 = SSIM_C1_12; #else assert(bit_depth == 8); + (void)bit_depth; #endif w = _ctx->level[_l].w; h = _ctx->level[_l].h; @@ -326,6 +327,7 @@ static void fs_calc_structure(fs_ctx 
*_ctx, int _l, int bit_depth) { if (bit_depth == 12) ssim_c2 = SSIM_C2_12; #else assert(bit_depth == 8); + (void)bit_depth; #endif w = _ctx->level[_l].w; diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.c b/libs/libvpx/vpx_dsp/fwd_txfm.c index 4e7d4053ea..6dcb3ba668 100644 --- a/libs/libvpx/vpx_dsp/fwd_txfm.c +++ b/libs/libvpx/vpx_dsp/fwd_txfm.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/fwd_txfm.h" @@ -21,36 +22,37 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. tran_low_t intermediate[4 * 4]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - tran_high_t input[4]; // canbe16 + tran_high_t in_high[4]; // canbe16 tran_high_t step[4]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 4; ++i) { // Load inputs. - if (0 == pass) { - input[0] = in_pass0[0 * stride] * 16; - input[1] = in_pass0[1 * stride] * 16; - input[2] = in_pass0[2 * stride] * 16; - input[3] = in_pass0[3 * stride] * 16; - if (i == 0 && input[0]) { - input[0] += 1; + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; } } else { - input[0] = in[0 * 4]; - input[1] = in[1 * 4]; - input[2] = in[2 * 4]; - input[3] = in[3 * 4]; + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; } // Transform. - step[0] = input[0] + input[3]; - step[1] = input[1] + input[2]; - step[2] = input[1] - input[2]; - step[3] = input[0] - input[3]; + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; temp1 = (step[0] + step[1]) * cospi_16_64; temp2 = (step[0] - step[1]) * cospi_16_64; out[0] = (tran_low_t)fdct_round_shift(temp1); @@ -60,12 +62,11 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { out[1] = (tran_low_t)fdct_round_shift(temp1); out[3] = (tran_low_t)fdct_round_shift(temp2); // Do next column (which is a transposed row in second/horizontal pass) - in_pass0++; - in++; + ++input; out += 4; } // Setup in/out for next pass. - in = intermediate; + in_low = intermediate; out = output; } @@ -83,7 +84,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - output[0] = sum << 1; + output[0] = sum * 2; } void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { @@ -99,7 +100,6 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { tran_high_t t0, t1, t2, t3; // needs32 tran_high_t x0, x1, x2, x3; // canbe16 - int i; for (i = 0; i < 8; i++) { // stage 1 if (pass == 0) { @@ -190,56 +190,57 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. 
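  // Descriptive note on the two-pass loop below: pass 0 reads the int16_t
  // *input rows directly, while pass 1 reads back the tran_low_t intermediate
  // through in_low, which is why in_low starts out NULL and is asserted
  // non-NULL before it is dereferenced.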
tran_low_t intermediate[256]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { tran_high_t step1[8]; // canbe16 tran_high_t step2[8]; // canbe16 tran_high_t step3[8]; // canbe16 - tran_high_t input[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; - input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; - input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; - input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; - input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; - input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; - step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; - step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; - step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; - step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; - step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; - step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; } else { // Calculate input for the first 8 results. 
- input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); - input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); - input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); - input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); - input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); - input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. - step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); - step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); - step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); - step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); - step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); - step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); - step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; } // Work on the first eight values; fdct8(input, even_results); { @@ -248,14 +249,14 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 - s0 = input[0] + input[7]; - s1 = input[1] + input[6]; - s2 = input[2] + input[5]; - s3 = input[3] + input[4]; - s4 = input[3] - input[4]; - s5 = input[2] - input[5]; - s6 = input[1] - input[6]; - s7 = input[0] - input[7]; + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; // fdct4(step, step); x0 = s0 + s3; @@ -350,12 +351,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { out[15] = (tran_low_t)fdct_round_shift(temp2); } // Do next column (which is a transposed row in second/horizontal pass) - in++; - in_pass0++; + input++; out += 16; } // Setup in/out for next pass. 
- in = intermediate; + in_low = intermediate; out = output; } } diff --git a/libs/libvpx/vpx_dsp/intrapred.c b/libs/libvpx/vpx_dsp/intrapred.c index eca17a983e..400e632e98 100644 --- a/libs/libvpx/vpx_dsp/intrapred.c +++ b/libs/libvpx/vpx_dsp/intrapred.c @@ -42,23 +42,6 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; } -#if CONFIG_MISC_FIXES -static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)above; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; @@ -76,22 +59,6 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -#if CONFIG_MISC_FIXES -static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { const uint8_t above_right = above[bs - 1]; @@ -111,21 +78,6 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -#if CONFIG_MISC_FIXES -static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; @@ -367,7 +319,7 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(3, 3) = AVG3(E, F, G); // differs from vp8 } -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const int A = above[0]; const int B = above[1]; @@ -533,75 +485,46 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, } } -#if CONFIG_MISC_FIXES -static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)above; - (void)bd; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; + int size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? 
AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst)); + vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), + size * sizeof(*dst)); + vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size); } } -#define highbd_d63e_predictor highbd_d63_predictor - static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + const uint16_t above_right = above[bs - 1]; + const uint16_t *const dst_row0 = dst; + int x, size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 - ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) - : above[bs * 2 - 1]; - } - dst += stride; - } -} -#if CONFIG_MISC_FIXES -static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)left; - (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size * sizeof(*dst)); + vpx_memset16(dst + size, above_right, x + 1); dst += stride; } } -#endif // CONFIG_MISC_FIXES static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, @@ -633,19 +556,30 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + int i; +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint16_t border[69]; +#else + uint16_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif (void)bd; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } - dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; - dst += stride; + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs * sizeof(dst[0])); } } @@ -776,6 +710,144 @@ 
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, dst += stride; } } + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + (void)bd; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + (void)bd; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)left; + (void)bd; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t 
*above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + (void)bd; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} #endif // CONFIG_VP9_HIGHBITDEPTH // This serves as a wrapper function, so that all the prediction functions @@ -811,7 +883,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_sized(type, 8) \ intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ intra_pred_highbd_sized(type, 8) \ intra_pred_highbd_sized(type, 16) \ intra_pred_highbd_sized(type, 32) @@ -832,11 +903,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_no_4x4(d207) intra_pred_no_4x4(d63) intra_pred_no_4x4(d45) -#if CONFIG_MISC_FIXES -intra_pred_allsizes(d207e) -intra_pred_allsizes(d63e) -intra_pred_no_4x4(d45e) -#endif intra_pred_no_4x4(d117) intra_pred_no_4x4(d135) intra_pred_no_4x4(d153) diff --git a/libs/libvpx/vpx_dsp/inv_txfm.c b/libs/libvpx/vpx_dsp/inv_txfm.c index 1526663d89..0194aa1e18 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.c +++ b/libs/libvpx/vpx_dsp/inv_txfm.c @@ -9,6 +9,7 @@ */ #include +#include #include #include "./vpx_dsp_rtcd.h" @@ -66,7 +67,7 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; @@ -83,179 +84,28 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1); ip++; dest++; } } -void idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1)); - step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1)); - step[3] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3]); - output[1] = WRAPLOW(step[1] + step[2]); - output[2] = WRAPLOW(step[1] - step[2]); - output[3] = WRAPLOW(step[0] - step[3]); -} - -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t 
temp_in[4], temp_out[4]; - - // Rows - for (i = 0; i < 4; ++i) { - idct4_c(input, outptr); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - idct4_c(temp_in, temp_out); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 4)); - } - } -} - -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_add(dest[0], a1); - dest[1] = clip_pixel_add(dest[1], a1); - dest[2] = clip_pixel_add(dest[2], a1); - dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; - } -} - -void idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - temp1 = (step1[0] + step1[2]) * cospi_16_64; - temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); -} - -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - 
} - } -} - -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - void iadst4_c(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - tran_low_t x0 = input[0]; tran_low_t x1 = input[1]; tran_low_t x2 = input[2]; tran_low_t x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; + memset(output, 0, 4 * sizeof(*output)); return; } + // 32-bit result is enough for the following multiplications. s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; @@ -280,9 +130,71 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } +void idct4_c(const tran_low_t *input, tran_low_t *output) { + int16_t step[4]; + tran_high_t temp1, temp2; + + // stage 1 + temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64; + temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64; + temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); + dest += stride; + } +} + void iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; - tran_high_t x0 = input[7]; tran_high_t x1 = input[0]; tran_high_t x2 = input[5]; @@ -293,8 +205,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; + memset(output, 0, 8 * sizeof(*output)); return; } @@ -357,14 +268,93 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { output[7] = WRAPLOW(-x1); } -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, 
int stride) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; +void idct8_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[8], step2[8]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0]; + step1[2] = (int16_t)input[4]; + step1[1] = (int16_t)input[2]; + step1[3] = (int16_t)input[6]; + temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64; + temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64; + temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; // First transform rows - // only first 4 row has non-zero coefs + for (i = 0; i < 8; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // Only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { idct8_c(input, outptr); input += 8; @@ -382,27 +372,209 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * 
cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); 
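+  // Descriptive note: the cospi_*_64 constants are Q14 fixed-point values, so
+  // dct_const_round_shift() rounds each product back down by 14 bits, and
+  // WRAPLOW() wraps the result at 16 bits so this C reference behaves like
+  // the 16-bit SIMD lanes do on overflow.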
+ + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + void idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; + int16_t step1[16], step2[16]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; + step1[0] = (int16_t)input[0 / 2]; + step1[1] = (int16_t)input[16 / 2]; + step1[2] = (int16_t)input[8 / 2]; + step1[3] = (int16_t)input[24 / 2]; + step1[4] = (int16_t)input[4 / 2]; + step1[5] = (int16_t)input[20 / 2]; + step1[6] = (int16_t)input[12 / 2]; + step1[7] = (int16_t)input[28 / 2]; + step1[8] = (int16_t)input[2 / 2]; + step1[9] = (int16_t)input[18 / 2]; + step1[10] = (int16_t)input[10 / 2]; + step1[11] = (int16_t)input[26 / 2]; + step1[12] = (int16_t)input[6 / 2]; + step1[13] = (int16_t)input[22 / 2]; + step1[14] = (int16_t)input[14 / 2]; + step1[15] = (int16_t)input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -549,9 +721,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t 
*dest, int stride) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows @@ -572,182 +744,37 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, } } -void iadst16_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; - return; + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; } - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + 
x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } } void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows. 
Since all non-zero dct coefficients are in @@ -772,7 +799,9 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { @@ -782,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } void idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; + int16_t step1[32], step2[32]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; + step1[0] = (int16_t)input[0]; + step1[1] = (int16_t)input[16]; + step1[2] = (int16_t)input[8]; + step1[3] = (int16_t)input[24]; + step1[4] = (int16_t)input[4]; + step1[5] = (int16_t)input[20]; + step1[6] = (int16_t)input[12]; + step1[7] = (int16_t)input[28]; + step1[8] = (int16_t)input[2]; + step1[9] = (int16_t)input[18]; + step1[10] = (int16_t)input[10]; + step1[11] = (int16_t)input[26]; + step1[12] = (int16_t)input[6]; + step1[13] = (int16_t)input[22]; + step1[14] = (int16_t)input[14]; + step1[15] = (int16_t)input[30]; - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64; + temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64; step1[16] = WRAPLOW(dct_const_round_shift(temp1)); step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64; + temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64; step1[17] = WRAPLOW(dct_const_round_shift(temp1)); step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; + temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; step1[18] = WRAPLOW(dct_const_round_shift(temp1)); step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; + temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; step1[19] = WRAPLOW(dct_const_round_shift(temp1)); step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; + temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; step1[20] = WRAPLOW(dct_const_round_shift(temp1)); step1[27] = 
WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; + temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; step1[21] = WRAPLOW(dct_const_round_shift(temp1)); step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; + temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; step1[22] = WRAPLOW(dct_const_round_shift(temp1)); step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; + temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; step1[23] = WRAPLOW(dct_const_round_shift(temp1)); step1[24] = WRAPLOW(dct_const_round_shift(temp2)); @@ -1150,23 +1179,17 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + int16_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; - if (zero_coeff[0] | zero_coeff[1]) + if (zero_coeff) idct32_c(input, outptr); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -1187,13 +1210,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 16x16 has non-zero coeff + // Only upper-left 16x16 has non-zero coeff for (i = 0; i < 16; ++i) { idct32_c(input, outptr); input += 32; @@ -1213,13 +1236,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 8x8 has non-zero coeff + // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { idct32_c(input, outptr); input += 32; @@ -1240,8 +1263,9 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1252,7 +1276,20 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } #if 
CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + +// 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse +// transform amplify bits + 1 bit for contingency in rounding and quantizing +#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25) + +static INLINE int detect_invalid_highbd_input(const tran_low_t *input, + int size) { + int i; + for (i = 0; i < size; ++i) + if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1; + return 0; +} + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -1261,7 +1298,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_high_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1310,14 +1346,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, + int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; const tran_low_t *ip = in; tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1330,32 +1365,84 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = - highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = - highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = - highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = - highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd); ip++; dest++; } } +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = (tran_high_t)sinpi_1_9 * x0; + s1 = (tran_high_t)sinpi_2_9 * x0; + s2 = (tran_high_t)sinpi_3_9 * x1; + s3 = (tran_high_t)sinpi_4_9 * x2; + s4 = (tran_high_t)sinpi_1_9 * x2; + s5 = (tran_high_t)sinpi_2_9 * x3; + s6 = (tran_high_t)sinpi_4_9 * x3; + s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
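+  // (dct_const_round_shift() below removes those 14 multiplication-scaling
+  // bits again: 29b - 14b = 15b, and HIGHBD_WRAPLOW() then wraps the result
+  // for the given bit depth.)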
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); +} + void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step[4]; tran_high_t temp1, temp2; (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64; + temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64; + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64; + temp2 = + input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64; + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); @@ -1364,13 +1451,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 4; ++i) { @@ -1390,15 +1476,15 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { int i; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1406,26 +1492,123 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; + dest += stride; } } +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + 
tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void)bd; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1; + s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1; + s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3; + s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3; + s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5; + s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5; + s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7; + s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5; + s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5; + s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7; + s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + + // stage 3 + s2 = (tran_high_t)cospi_16_64 * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (x6 - x7); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); +} + void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + // stage 1 step1[0] = input[0]; step1[2] = input[4]; step1[1] = input[2]; step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * 
cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64; + temp2 = + input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64; + temp2 = + input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half vpx_highbd_idct4_c(step1, step1, bd); @@ -1438,10 +1621,10 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 3 - odd half step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 @@ -1455,22 +1638,21 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. + // First transform rows for (i = 0; i < 8; ++i) { vpx_highbd_idct8_c(input, outptr, bd); input += 8; outptr += 8; } - // Then transform columns. 
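// Summary of the add/scale step below: the 2-D inverse transform is applied
// separably (rows first, then columns), and each reconstructed residual is
// scaled with ROUND_POWER_OF_TWO(temp_out[j], 5), i.e. (temp_out[j] + 16) >> 5
// for the 8x8 transform (the 4x4 path uses a shift of 4, the 16x16 and 32x32
// paths a shift of 6), before highbd_clip_pixel_add() clamps the sum with the
// prediction in dest[] to the valid pixel range for bd.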
+ // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); @@ -1481,14 +1663,41 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // Only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -1496,145 +1705,181 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; (void)bd; - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); return; } - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. 
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); -} - -void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[7]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; - tran_low_t x7 = input[6]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); return; } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; + s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; + s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; + s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; + s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; + s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; + s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; + s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; + s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; + s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; + s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; + s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; + s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; + s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; + s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; + s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; + s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; + s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; + s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; + s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; + s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; + s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; + s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; + s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; + s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; + s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; + s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; + s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; + s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); x2 = HIGHBD_WRAPLOW(s0 - s2, bd); x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + 
s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); + // stage 4 + s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); + s10 = (tran_high_t)cospi_16_64 * (x10 + x11); + s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); + s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); + s15 = (tran_high_t)cospi_16_64 * (x14 - x15); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x4, bd); - output[2] = HIGHBD_WRAPLOW(x6, bd); - output[3] = HIGHBD_WRAPLOW(-x2, bd); - output[4] = HIGHBD_WRAPLOW(x3, bd); - output[5] = HIGHBD_WRAPLOW(-x7, bd); - output[6] = HIGHBD_WRAPLOW(x5, bd); - output[7] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - // Only first 4 row has non-zero coefs. - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - // Then transform columns. 
- for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); } void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1642,6 +1887,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_high_t temp1, temp2; (void)bd; + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); + return; + } + // stage 1 step1[0] = input[0 / 2]; step1[1] = input[16 / 2]; @@ -1670,25 +1923,33 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * 
(tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1696,14 +1957,18 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -1715,14 +1980,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -1730,14 +1997,18 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -1747,10 +2018,10 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -1773,14 +2044,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -1803,22 +2074,21 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. + // First transform rows for (i = 0; i < 16; ++i) { vpx_highbd_idct16_c(input, outptr, bd); input += 16; outptr += 16; } - // Then transform columns. 
+ // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -1829,183 +2099,40 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - 
s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; + tran_low_t 
out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. @@ -2015,7 +2142,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, outptr += 16; } - // Then transform columns. + // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -2026,15 +2153,15 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2048,6 +2175,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, tran_high_t temp1, temp2; (void)bd; + if (detect_invalid_highbd_input(input, 32)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 32); + return; + } + // stage 1 step1[0] = input[0]; step1[1] = input[16]; @@ -2066,45 +2201,61 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[14] = input[14]; step1[15] = input[30]; - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64; + temp2 = + input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; + step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] 
= HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[17] * (tran_high_t)cospi_15_64 - + input[15] * (tran_high_t)cospi_17_64; + temp2 = input[17] * (tran_high_t)cospi_17_64 + + input[15] * (tran_high_t)cospi_15_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; + temp2 = + input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; + temp2 = + input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; + temp2 = + input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[21] * (tran_high_t)cospi_11_64 - + input[11] * (tran_high_t)cospi_21_64; + temp2 = input[21] * (tran_high_t)cospi_21_64 + + input[11] * (tran_high_t)cospi_11_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[13] * (tran_high_t)cospi_19_64 - + input[19] * (tran_high_t)cospi_13_64; + temp2 = input[13] * (tran_high_t)cospi_13_64 + + input[19] * (tran_high_t)cospi_19_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; + temp2 = + 
input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2116,25 +2267,33 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); @@ -2159,14 +2318,18 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * 
(tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -2179,38 +2342,48 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[17] * (tran_high_t)cospi_4_64 + + step2[30] * (tran_high_t)cospi_28_64; + temp2 = step2[17] * (tran_high_t)cospi_28_64 + + step2[30] * (tran_high_t)cospi_4_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * (tran_high_t)cospi_28_64 - + step2[29] * (tran_high_t)cospi_4_64; + temp2 = -step2[18] * (tran_high_t)cospi_4_64 + + step2[29] * (tran_high_t)cospi_28_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * (tran_high_t)cospi_20_64 + + step2[26] * (tran_high_t)cospi_12_64; + temp2 = step2[21] * (tran_high_t)cospi_12_64 + + step2[26] * (tran_high_t)cospi_20_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[22] * (tran_high_t)cospi_12_64 - + step2[25] * (tran_high_t)cospi_20_64; + temp2 = -step2[22] * (tran_high_t)cospi_20_64 + + step2[25] * (tran_high_t)cospi_12_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - 
step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -2218,14 +2391,18 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -2253,10 +2430,10 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -2270,22 +2447,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[20] 
* cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * (tran_high_t)cospi_8_64 + + step2[29] * (tran_high_t)cospi_24_64; + temp2 = step2[18] * (tran_high_t)cospi_24_64 + + step2[29] * (tran_high_t)cospi_8_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[19] * (tran_high_t)cospi_8_64 + + step2[28] * (tran_high_t)cospi_24_64; + temp2 = step2[19] * (tran_high_t)cospi_24_64 + + step2[28] * (tran_high_t)cospi_8_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[20] * (tran_high_t)cospi_24_64 - + step2[27] * (tran_high_t)cospi_8_64; + temp2 = -step2[20] * (tran_high_t)cospi_8_64 + + step2[27] * (tran_high_t)cospi_24_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * (tran_high_t)cospi_24_64 - + step2[26] * (tran_high_t)cospi_8_64; + temp2 = -step2[21] * (tran_high_t)cospi_8_64 + + step2[26] * (tran_high_t)cospi_24_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2304,14 +2489,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2355,22 +2540,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; @@ -2411,26 +2596,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + tran_low_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; - if (zero_coeff[0] | zero_coeff[1]) + if (zero_coeff) highbd_idct32_c(input, outptr, bd); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -2449,21 +2627,49 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows - // Only upper-left 8x8 has non-zero coeff. 
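In the hunks around this point, the full 1024-coefficient path now detects all-zero rows with a single OR over the 32 coefficients of each row (replacing the old pairwise reduction tree), while the new _135 and _34 variants simply stop the row pass after 16 or 8 rows, since only the upper-left 16x16 or 8x8 region of the coefficient block is populated; the column pass is the same in every case. A condensed sketch of that row/column structure follows, with an illustrative helper name and an extra row-count parameter that are not part of the library:

/* Sketch only: row/column split used by the reduced-coefficient 32x32
 * inverse transforms. Transform the first nonzero_rows rows, leave the
 * rest of out[] at its zero initialization, then transform every column
 * and add the rounded result into dest. */
static void highbd_idct32x32_rows_cols_sketch(const tran_low_t *input,
                                              uint16_t *dest, int stride,
                                              int bd, int nonzero_rows) {
  int i, j;
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t temp_in[32], temp_out[32];

  for (i = 0; i < nonzero_rows; ++i) {
    highbd_idct32_c(input + 32 * i, out + 32 * i, bd);
  }

  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}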
+ // Only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { highbd_idct32_c(input, outptr, bd); input += 32; outptr += 32; } + // Columns for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; @@ -2475,15 +2681,15 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { @@ -2491,4 +2697,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, dest += stride; } } + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/inv_txfm.h b/libs/libvpx/vpx_dsp/inv_txfm.h index e530730d57..13137659fa 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.h +++ b/libs/libvpx/vpx_dsp/inv_txfm.h @@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { (void)bd; return input; } - -static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return (tran_high_t)rv; -} #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_EMULATE_HARDWARE diff --git a/libs/libvpx/vpx_dsp/loopfilter.c b/libs/libvpx/vpx_dsp/loopfilter.c index 2f7eff8b05..9866ea37d6 100644 --- a/libs/libvpx/vpx_dsp/loopfilter.c +++ b/libs/libvpx/vpx_dsp/loopfilter.c @@ -94,8 +94,8 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; // save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way filter1 = signed_char_clamp(filter + 4) >> 3; filter2 = signed_char_clamp(filter + 3) >> 3; @@ -308,12 +308,12 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, } } -void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { +void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_edge_16_c(uint8_t *s, 
int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); } @@ -425,8 +425,8 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; // Save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way. + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way. filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; @@ -673,14 +673,13 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, } } -void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); } -void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { diff --git a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c index e372b9d8c9..43d2c1146e 100644 --- a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c +++ b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c @@ -14,7 +14,14 @@ void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int32_t pitch) { - uint32_t i, j; + int i, j; + v16u8 pos0, pos1, ref0, ref1; + v16i8 black_clamp, white_clamp, both_clamp; + + black_clamp = __msa_fill_b(blackclamp); + white_clamp = __msa_fill_b(whiteclamp); + both_clamp = black_clamp + white_clamp; + both_clamp = -both_clamp; for (i = 0; i < height / 2; ++i) { uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; @@ -22,29 +29,16 @@ void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; const int8_t *ref1_ptr = noise + (rand() & 0xff); for (j = width / 16; j--;) { - v16i8 temp00_s, temp01_s; - v16u8 temp00, temp01, black_clamp, white_clamp; - v16u8 pos0, ref0, pos1, ref1; - v16i8 const127 = __msa_ldi_b(127); - pos0 = LD_UB(pos0_ptr); ref0 = LD_UB(ref0_ptr); pos1 = LD_UB(pos1_ptr); ref1 = LD_UB(ref1_ptr); - black_clamp = (v16u8)__msa_fill_b(blackclamp); - white_clamp = (v16u8)__msa_fill_b(whiteclamp); - temp00 = (pos0 < black_clamp); - pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); - temp01 = (pos1 < black_clamp); - pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); - XORI_B2_128_UB(pos0, pos1); - temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp00 = (v16u8)(temp00_s < pos0); - pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); - temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp01 = (temp01_s < pos1); - pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); - XORI_B2_128_UB(pos0, pos1); + pos0 = __msa_subsus_u_b(pos0, black_clamp); + pos1 = __msa_subsus_u_b(pos1, black_clamp); + pos0 = __msa_subsus_u_b(pos0, both_clamp); + pos1 = __msa_subsus_u_b(pos1, both_clamp); + pos0 = __msa_subsus_u_b(pos0, white_clamp); + pos1 = __msa_subsus_u_b(pos1, white_clamp); pos0 += 
ref0; ST_UB(pos0, pos0_ptr); pos1 += ref1; diff --git a/libs/libvpx/vpx_dsp/mips/avg_msa.c b/libs/libvpx/vpx_dsp/mips/avg_msa.c index 52a24ed379..d0ac7b8e29 100644 --- a/libs/libvpx/vpx_dsp/mips/avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/avg_msa.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -54,3 +55,674 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } + +void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); +} + +void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + 
TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src11, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8); + + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, 
res5, res6, res7); + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8); + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); +} + +int vpx_satd_msa(const int16_t *data, int length) { + int i, satd; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 src8, src9, src10, src11, src12, src13, src14, src15; + v8i16 zero = { 0 }; + v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h; + v4u32 tmp0_w = { 0 }; + + if (16 == length) { + LD_SH2(data, 8, src0, src1); + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + satd = HADD_UW_U32(tmp0_w); + } else if (64 == length) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + satd = HADD_UW_U32(tmp0_w); + } else if (256 == length) { + for (i = 0; i < 2; 
++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else if (1024 == length) { + for (i = 0; i < 8; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else { + satd = 0; + + for (i = 0; i < length; ++i) { + satd += abs(data[i]); + } + } + + return satd; +} + +void vpx_int_pro_row_msa(int16_t hbuf[16], 
const uint8_t *ref, + const int ref_stride, const int height) { + int i; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 hbuf_r = { 0 }; + v8i16 hbuf_l = { 0 }; + v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l; + v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l; + + if (16 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 3); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (32 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 4); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (64 == height) { + for (i = 4; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, 
hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 5); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else { + const int norm_factor = height >> 1; + int cnt; + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] = 0; + } + + for (i = 0; i < height; ++i) { + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] += ref[cnt]; + } + + ref += ref_stride; + } + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] /= norm_factor; + } + } +} + +int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { + int16_t sum; + v16u8 ref0, ref1, ref2, ref3; + v8u16 ref0_h; + + if (16 == width) { + ref0 = LD_UB(ref); + ref0_h = __msa_hadd_u_h(ref0, ref0); + sum = HADD_UH_U32(ref0_h); + } else if (32 == width) { + LD_UB2(ref, 16, ref0, ref1); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + sum = HADD_UH_U32(ref0_h); + } else if (64 == width) { + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + ref0_h += __msa_hadd_u_h(ref2, ref2); + ref0_h += __msa_hadd_u_h(ref3, ref3); + sum = HADD_UH_U32(ref0_h); + } else { + int idx; + + sum = 0; + for (idx = 0; idx < width; ++idx) { + sum += ref[idx]; + } + } + + return sum; +} + +int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { + int sse, mean, var; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2; + v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m; + v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m; + v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m; + v4i32 res_l7_m, mean_v; + v2i64 sse_v; + + if (2 == bwl) { + LD_SH2(src, 8, src0, src1); + LD_SH2(ref, 8, ref0, ref1); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (3 == bwl) { + LD_SH4(src, 8, src0, src1, src2, src3); + LD_SH4(ref, 8, ref0, ref1, ref2, ref3); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, 
ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (4 == bwl) { + LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m); + ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m); + ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m); + ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v += res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else { + int i; + const int width = 4 << bwl; + + sse = 0; + mean = 0; + + for (i = 0; i < width; ++i) { + const int diff = ref[i] - src[i]; + + mean += diff; + sse += diff * diff; + } + } + + var = sse - ((mean * mean) >> (bwl + 2)); + + return var; +} + +void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; + v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1; + + LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7); + PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3); + PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3); + + diff0 = __msa_asub_u_b(s0, d0); + diff1 = __msa_asub_u_b(s1, d1); + diff2 = __msa_asub_u_b(s2, d2); + diff3 = __msa_asub_u_b(s3, d3); + + min0 = __msa_min_u_b(diff0, diff1); + min1 = 
__msa_min_u_b(diff2, diff3); + min0 = __msa_min_u_b(min0, min1); + + max0 = __msa_max_u_b(diff0, diff1); + max1 = __msa_max_u_b(diff2, diff3); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1); + max0 = __msa_max_u_b(max0, max1); + + *min = min0[0]; + *max = max0[0]; +} diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c index ae88eddfd6..18e7d5375d 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index e944207b6e..7dcb662d7f 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c index 5cc06b5f26..9e65a8f50f 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void 
vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c index eb1975e447..a3e967b405 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index 31812299c3..d9c2bef69e 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const 
uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); @@ -390,21 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { int x, y; - uint32_t tp1, tp2, tn1; - uint32_t tp3, tp4, tn2; + uint32_t tp1, tp2, tn1, tp3, tp4, tn2; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index 9a9bab25a5..fb68ad8813 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c index f6812c7d04..89f0f41962 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int 
y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; @@ -1307,6 +1309,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, assert(y_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const int32_t *)filter_y)[1] != 0x800000); + (void)x_step_q4; /* bit positon for extract from acc */ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" @@ -1394,10 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { int x, y; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 196a0a2f0b..77e95c8444 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index ad107d5c47..c329f71ccf 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t 
*)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 4eee3bd5e1..48e440d73c 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -24,21 +24,21 @@ extern "C" { #if HAVE_DSPR2 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, @@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/libs/libvpx/vpx_dsp/mips/deblock_msa.c b/libs/libvpx/vpx_dsp/mips/deblock_msa.c index 402d7ed997..aafa272fbd 100644 --- a/libs/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/libs/libvpx/vpx_dsp/mips/deblock_msa.c @@ -9,6 +9,7 @@ */ #include + #include "./macros_msa.h" extern const int16_t vpx_rv[]; @@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t *p_dst_st = dst_ptr; uint8_t *f_orig = f; uint16_t col; + uint64_t out0, out1, out2, out3; v16u8 above2, above1, below2, below1; v16u8 src, ref, ref_temp; v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; @@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, f += 16; } + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * 
src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter8, 0); + out1 = __msa_copy_u_d((v2i64)inter9, 0); + out2 = __msa_copy_u_d((v2i64)inter10, 0); + out3 = __msa_copy_u_d((v2i64)inter11, 0); + SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter12, 0); + out1 = __msa_copy_u_d((v2i64)inter13, 0); + out2 = __msa_copy_u_d((v2i64)inter14, 0); + out3 = __msa_copy_u_d((v2i64)inter15, 0); + SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride); + } + f = f_orig; p_dst = dst_ptr - 2; LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, @@ -454,12 +517,12 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; - v4u32 src_r_w, src_l_w; + v4u32 src_r_w; v4i32 flimit_vec; flimit_vec = __msa_fill_w(flimit); for (row = rows; row--;) { - int32_t sum_sq = 0; + int32_t sum_sq; int32_t sum = 0; src0 = (v16u8)__msa_fill_b(src_dup[0]); ST8x1_UB(src0, (src_dup - 8)); @@ -473,9 +536,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src[15] = 0; ILVRL_B2_UH(zero, src, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); - src_l_w = __msa_dotp_u_w(src_l_h, src_l_h); - sum_sq = HADD_SW_S32(src_r_w); - sum_sq += HADD_SW_S32(src_l_w); + src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); + sum_sq = HADD_SW_S32(src_r_w) + 16; sum_h = 
__msa_hadd_u_h(src, src); sum = HADD_UH_U32(sum_h); { @@ -574,7 +636,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, int32_t cols, int32_t flimit) { int32_t row, col, cnt, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; v4i32 flimit_vec; v16u8 dst7, dst8, dst_r_b, dst_l_b; v16i8 mask; @@ -602,7 +663,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, dst = LD_UB(dst_tmp); for (cnt = (col << 4), i = 0; i < 16; ++cnt) { - rv2[i] = rv3 + ((cnt * 17) & 127); + rv2[i] = vpx_rv + (i & 7); ++i; } for (cnt = -8; cnt < 0; ++cnt) { diff --git a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c index e41a904808..06fdc951e7 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -927,21 +927,21 @@ void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out, } void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16, stride); - sum += LD_HADD(input + 24, stride); - sum += LD_HADD(input + 32 * 8, stride); - sum += LD_HADD(input + 32 * 8 + 8, stride); - sum += LD_HADD(input + 32 * 8 + 16, stride); - sum += LD_HADD(input + 32 * 8 + 24, stride); - sum += LD_HADD(input + 32 * 16, stride); - sum += LD_HADD(input + 32 * 16 + 8, stride); - sum += LD_HADD(input + 32 * 16 + 16, stride); - sum += LD_HADD(input + 32 * 16 + 24, stride); - sum += LD_HADD(input + 32 * 24, stride); - sum += LD_HADD(input + 32 * 24 + 8, stride); - sum += LD_HADD(input + 32 * 24 + 16, stride); - sum += LD_HADD(input + 32 * 24 + 24, stride); + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 16; ++i) { + LD_SH4(input, 8, in0, in1, in2, in3); + input += stride; + LD_SH4(input, 8, in4, in5, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 3); } diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c index fdead50503..5a6dfcef2f 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c +++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c @@ -8,8 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w = __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + out[0] = HADD_SW_S32(vec_w); + out[1] = 0; +} + +#if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -215,11 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } -void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[0] = LD_HADD(input, stride); - out[1] = 0; -} - void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; @@ -237,9 +247,26 @@ void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, } void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16 * 8, stride); - sum += LD_HADD(input + 16 * 8 + 8, stride); + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 4; ++i) { + LD_SH2(input, 8, in0, in1); + input += stride; + LD_SH2(input, 8, in2, in3); + input += stride; + LD_SH2(input, 8, in4, in5); + input += stride; + LD_SH2(input, 8, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 1); } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h index db5e90e7b9..fd589224d3 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h @@ -14,22 +14,6 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define LD_HADD(psrc, stride) \ - ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ - in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ - }) - #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ diff --git a/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c index 3e29d0ac39..835e10e125 100644 --- a/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c @@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c 
b/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c index 9f51d50c75..dce03a2b2a 100644 --- a/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c index eac79d5100..16e7fc5507 100644 --- a/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index edd54aec5e..27881f0db6 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -57,18 +57,15 @@ extern "C" { out; \ }) -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output); -void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst4_dspr2(const int16_t *input, int16_t *output); void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst8_dspr2(const int16_t *input, int16_t *output); void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst16_dspr2(const int16_t *input, int16_t *output); #endif // #if HAVE_DSPR2 diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h index 8b8a4173d2..1fe9b28e8a 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -196,18 +196,18 @@ out2, out3) \ { \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ \ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, 
tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ } /* idct 8x8 macro */ diff --git a/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c index 0ec0c2059f..44ba65c7ac 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c @@ -389,7 +389,7 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, } } -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -712,14 +712,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_1], %[step1_6] \n\t" "add %[load6], %[load6], %[step1_14] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_2], %[step1_5] \n\t" @@ -731,14 +731,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_3], %[step1_4] \n\t" "add %[load6], %[load6], %[step1_12] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_3], %[step1_4] \n\t" @@ -750,14 +750,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_2], %[step1_5] \n\t" "add %[load6], %[load6], %[step1_10] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "sub %[load5], %[step1_1], %[step1_6] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" @@ -769,14 +769,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_0], %[step1_7] \n\t" "add %[load6], %[load6], %[step1_8] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu 
%[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_0], %[step1_7] \n\t" @@ -788,14 +788,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_1], %[step1_6] \n\t" "sub %[load6], %[load6], %[step1_9] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_2], %[step1_5] \n\t" @@ -807,14 +807,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_3], %[step1_4] \n\t" "sub %[load6], %[load6], %[step1_11] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_3], %[step1_4] \n\t" @@ -826,14 +826,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_2], %[step1_5] \n\t" "sub %[load6], %[load6], %[step1_13] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_1], %[step1_6] \n\t" @@ -845,7 +845,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_0], %[step1_7] \n\t" "sub %[load6], %[load6], %[step1_15] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" @@ -856,7 +856,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) : - [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), @@ -869,7 +869,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { } void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, 
int16_t, out[16 * 16]); uint32_t pos = 45; @@ -880,11 +880,11 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *outptr = out; uint32_t i; @@ -924,11 +924,11 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -975,13 +975,54 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -1005,13 +1046,13 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] 
"r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c index ce25d55c9c..3f043b48ba 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -13,26 +13,25 @@ #include "vpx_dsp/txfm_common.h" #if HAVE_DSPR2 -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int i, temp21; + int i; uint8_t *dest_pix, *dest_pix1; const int const_2_power_13 = 8192; uint8_t *cm = vpx_ff_cropTbl; @@ -49,7 +48,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, for (i = 0; i < 32; ++i) { dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; + dest_pix1 = dest + i + 31 * stride; __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" @@ -103,9 +102,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 
[cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), @@ -163,9 +162,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -223,9 +222,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), @@ -279,9 +278,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -335,9 +334,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -391,9 +390,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] 
"=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -434,116 +433,154 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add 
%[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), 
[step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -580,9 +617,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] 
"r"(const_2_power_13), [input] "r"(input), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64)); @@ -638,96 +675,137 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; + __asm__ __volatile__( + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); + + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], 
%[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] 
"r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -738,14 +816,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_1], %[step2_30] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_2], %[step2_29] \n\t" @@ -755,18 +833,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_3], %[step2_28] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), @@ -782,29 +860,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb 
%[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -815,14 +893,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_5], %[step1_26] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_6], %[step1_25] \n\t" @@ -832,18 +910,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_7], %[step1_24] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), @@ -859,29 +937,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] 
\n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -892,14 +970,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_9], %[step1_22] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_10], %[step1_21] \n\t" @@ -909,18 +987,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_11], %[step1_20] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), @@ -936,29 +1014,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] 
"=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -969,14 +1047,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_13], %[step2_18] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_14], %[step2_17] \n\t" @@ -986,7 +1064,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_15], %[step2_16] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" @@ -996,11 +1074,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), - [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), - [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12), + [step1_13] "r"(step1_13), [step1_14] "r"(step1_14), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [step2_17] "r"(step2_17), [step2_18] "r"(step2_18), + [step2_19] "r"(step2_19)); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); @@ -1012,18 +1090,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" @@ -1031,9 +1109,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); input += 32; } diff --git a/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c index d71c5ffed5..3c0468c00f 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c @@ -18,24 +18,23 @@ #if HAVE_DSPR2 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int temp21; int i; const int const_2_power_13 = 8192; const int32_t *input_int; @@ -147,9 +146,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), 
[cospi_17_64] "r"(cospi_17_64), @@ -207,9 +206,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -267,9 +266,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), @@ -289,7 +288,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_19_64] \n\t" "msub $ac1, %[load2], %[cospi_13_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" "madd $ac3, %[load2], %[cospi_19_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -302,7 +300,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_3_64] \n\t" "msub $ac2, %[load4], %[cospi_29_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" "madd $ac1, %[load4], %[cospi_3_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -314,12 +311,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" "msub $ac1, %[load2], %[cospi_20_64] \n\t" "msub $ac3, %[load1], %[cospi_20_64] \n\t" "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" "extp %[step1_25], $ac3, 31 \n\t" "add %[step1_23], %[temp0], %[temp1] \n\t" @@ -327,9 +322,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -349,7 +344,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_30_64] \n\t" "msub $ac1, %[load2], %[cospi_2_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, 
%[load1], %[cospi_2_64] \n\t" "madd $ac3, %[load2], %[cospi_30_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -362,7 +356,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_14_64] \n\t" "msub $ac2, %[load4], %[cospi_18_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" "madd $ac1, %[load4], %[cospi_14_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -374,12 +367,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp0], %[temp1] \n\t" "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" "madd $ac1, %[load2], %[cospi_24_64] \n\t" "madd $ac3, %[load1], %[cospi_24_64] \n\t" "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" "extp %[step2_14], $ac3, 31 \n\t" "add %[step2_8], %[temp0], %[temp1] \n\t" @@ -387,9 +378,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -409,7 +400,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_22_64] \n\t" "msub $ac1, %[load2], %[cospi_10_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" "madd $ac3, %[load2], %[cospi_22_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -422,7 +412,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_6_64] \n\t" "msub $ac2, %[load4], %[cospi_26_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" "madd $ac1, %[load4], %[cospi_6_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -434,12 +423,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" "msub $ac1, %[load2], %[cospi_8_64] \n\t" "madd $ac3, %[load2], %[cospi_24_64] \n\t" "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" "extp %[step2_13], $ac3, 31 \n\t" "add %[step2_11], %[temp0], %[temp1] \n\t" @@ -447,9 +434,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -462,21 +449,18 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[temp0], 
%[temp0], %[step2_9] \n\t" "add %[temp0], %[temp0], %[step2_10] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "sub %[temp1], %[step2_14], %[step2_13] \n\t" "add %[temp1], %[temp1], %[step2_9] \n\t" "sub %[temp1], %[temp1], %[step2_10] \n\t" "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" "mthi $zero, $ac2 \n\t" "sub %[temp0], %[step2_15], %[step2_12] \n\t" "sub %[temp0], %[temp0], %[step2_8] \n\t" "add %[temp0], %[temp0], %[step2_11] \n\t" "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" "mthi $zero, $ac3 \n\t" "sub %[temp1], %[step2_15], %[step2_12] \n\t" @@ -488,122 +472,159 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step3_9], %[step2_9], %[step2_10] \n\t" "add %[step3_14], %[step2_13], %[step2_14] \n\t" "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" "extp %[step3_13], $ac1, 31 \n\t" "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : 
[step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub 
%[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] 
"r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -627,29 +648,25 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac3, %[load3], %[cospi_24_64] \n\t" "msub $ac3, %[load4], %[cospi_8_64] \n\t" "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "madd $ac1, %[load3], %[cospi_8_64] \n\t" "madd $ac1, %[load4], %[cospi_24_64] \n\t" "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64) - - ); + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" @@ -665,7 +682,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_28_64] \n\t" "msub $ac1, %[load2], %[cospi_4_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" "madd $ac3, %[load2], %[cospi_28_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -678,7 +694,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_12_64] \n\t" "msub $ac2, %[load4], %[cospi_20_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" "madd $ac1, %[load4], %[cospi_12_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -691,11 +706,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp3], %[temp2] \n\t" "sub %[load1], %[load1], %[temp0] \n\t" "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" "sub %[load2], %[load2], %[temp2] \n\t" "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" "madd $ac3, %[load2], %[cospi_16_64] \n\t" @@ -706,129 +719,246 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + 
step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); + + // stage 7 + __asm__ __volatile__( + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); + + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi 
$zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] 
"r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; + __asm__ __volatile__( + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "add %[temp2], %[step1_2], %[step2_29] \n\t" + "add %[temp3], %[step1_3], %[step2_28] \n\t" + "sub %[load1], %[step1_3], %[step2_28] \n\t" + "sub %[load2], %[step1_2], %[step2_29] \n\t" + "sub %[load3], %[step1_1], %[step2_30] \n\t" + "sub %[load4], %[step1_0], %[step2_31] \n\t" + "sh %[temp0], 0(%[output]) \n\t" + "sh %[temp1], 64(%[output]) \n\t" + "sh %[temp2], 128(%[output]) \n\t" + "sh %[temp3], 192(%[output]) \n\t" + "sh %[load1], 1792(%[output]) \n\t" + "sh %[load2], 1856(%[output]) \n\t" + "sh %[load3], 1920(%[output]) \n\t" + "sh %[load4], 1984(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31), + [step1_1] "r"(step1_1), [step2_30] "r"(step2_30), + [step1_2] "r"(step1_2), [step2_29] "r"(step2_29), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "add %[temp2], %[step1_6], %[step1_25] \n\t" + "add %[temp3], %[step1_7], %[step1_24] \n\t" + "sub %[load1], %[step1_7], %[step1_24] \n\t" + "sub %[load2], %[step1_6], %[step1_25] \n\t" + "sub %[load3], %[step1_5], %[step1_26] \n\t" + "sub %[load4], %[step1_4], %[step1_27] \n\t" + "sh %[temp0], 256(%[output]) \n\t" + "sh %[temp1], 320(%[output]) \n\t" + "sh %[temp2], 384(%[output]) \n\t" + "sh %[temp3], 448(%[output]) \n\t" + "sh %[load1], 1536(%[output]) \n\t" + "sh %[load2], 1600(%[output]) \n\t" + "sh %[load3], 1664(%[output]) \n\t" + "sh %[load4], 1728(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27), + [step1_5] "r"(step1_5), [step1_26] "r"(step1_26), + [step1_6] "r"(step1_6), [step1_25] "r"(step1_25), + [step1_7] 
"r"(step1_7), [step1_24] "r"(step1_24), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "add %[temp2], %[step1_10], %[step1_21] \n\t" + "add %[temp3], %[step1_11], %[step1_20] \n\t" + "sub %[load1], %[step1_11], %[step1_20] \n\t" + "sub %[load2], %[step1_10], %[step1_21] \n\t" + "sub %[load3], %[step1_9], %[step1_22] \n\t" + "sub %[load4], %[step1_8], %[step1_23] \n\t" + "sh %[temp0], 512(%[output]) \n\t" + "sh %[temp1], 576(%[output]) \n\t" + "sh %[temp2], 640(%[output]) \n\t" + "sh %[temp3], 704(%[output]) \n\t" + "sh %[load1], 1280(%[output]) \n\t" + "sh %[load2], 1344(%[output]) \n\t" + "sh %[load3], 1408(%[output]) \n\t" + "sh %[load4], 1472(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23), + [step1_9] "r"(step1_9), [step1_22] "r"(step1_22), + [step1_10] "r"(step1_10), [step1_21] "r"(step1_21), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "add %[temp2], %[step1_14], %[step2_17] \n\t" + "add %[temp3], %[step1_15], %[step2_16] \n\t" + "sub %[load1], %[step1_15], %[step2_16] \n\t" + "sub %[load2], %[step1_14], %[step2_17] \n\t" + "sub %[load3], %[step1_13], %[step2_18] \n\t" + "sub %[load4], %[step1_12], %[step2_19] \n\t" + "sh %[temp0], 768(%[output]) \n\t" + "sh %[temp1], 832(%[output]) \n\t" + "sh %[temp2], 896(%[output]) \n\t" + "sh %[temp3], 960(%[output]) \n\t" + "sh %[load1], 1024(%[output]) \n\t" + "sh %[load2], 1088(%[output]) \n\t" + "sh %[load3], 1152(%[output]) \n\t" + "sh %[load4], 1216(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19), + [step1_13] "r"(step1_13), [step2_18] "r"(step2_18), + [step1_14] "r"(step1_14), [step2_17] "r"(step2_17), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [output] "r"(output)); input += 32; output += 1; @@ -836,7 +966,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, } void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t pos = 45; @@ -850,7 +980,7 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, idct32_rows_dspr2(input, outptr, 32); // Columns - vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride); + vpx_idct32_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -941,7 +1071,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { @@ -980,12 +1110,71 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, [dest] "+&r"(dest) : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are 
four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } } else { /* use quad-byte * input and output memory are four byte aligned */ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r"(vector_a1) + : [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { diff --git a/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 516ea80f4a..e214b538d4 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -15,7 +15,7 @@ #if HAVE_DSPR2 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; int i; @@ -96,23 +96,13 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { } void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; + int stride) { + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; + const int const_255 = 255; int i; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - 
prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 4; ++i) { dest_pix = (dest + i); @@ -172,51 +162,62 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 4; } } -void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; uint32_t pos = 45; @@ -230,11 +231,10 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, vpx_idct4_rows_dspr2(input, outptr); // Columns - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_1_add_dspr2(const int16_t 
*input, uint8_t *dest, int stride) { int a1, absa1; int r; int32_t out; @@ -271,10 +271,43 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 3; + a12 = a1 - (a11 * 7); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -288,10 +321,10 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } @@ -310,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) { return; } + // 32-bit result is enough for the following multiplications. 
s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; diff --git a/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c index 08a6c78b6e..d4d246965c 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c @@ -192,24 +192,13 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { } } -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; const int const_2_power_13 = 8192; + const int const_255 = 255; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 8; ++i) { dest_pix = (dest + i); @@ -356,70 +345,94 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi 
%[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), @@ -427,19 +440,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 8; } } -void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -451,11 +463,10 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void 
vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -490,11 +501,10 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, : [outptr] "r"(outptr)); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -533,11 +543,47 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 2; + a12 = a1 - (a11 * 3); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -555,11 +601,11 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c index 2c8a10fc5d..b1731f2345 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -8,13 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_ports/mem.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" +#include "vpx_ports/mem.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, } } -void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { +static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -403,20 +405,21 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { } } -void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, int32_t count) { +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); uint8_t early_exit = 0; (void)count; - early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); if (0 == early_exit) { - vpx_hz_lpf_t16_16w(src, pitch, filter48); + hz_lpf_t16_16w(src, pitch, filter48); } } @@ -448,7 +451,7 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -638,19 +641,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, } } } else { - vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, - thresh_ptr, count); + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); } } -void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); } -void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, +void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr) { @@ -752,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); } -int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t 
*filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -778,7 +781,7 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, /* flat4 */ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -819,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16i8 zero = { 0 }; v16u8 filter8, flat, flat2; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1050,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); early_exit = - vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); @@ -1063,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, } } -int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -1140,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1472,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); early_exit = - vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_16x16(transposed_input, 16, (src - 8), pitch); diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c index 
e0079665f7..0eff2b6ca9 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, @@ -27,7 +28,7 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); p1_d = __msa_copy_u_d((v2i64)p1_out, 0); p0_d = __msa_copy_u_d((v2i64)p0_out, 0); @@ -86,7 +87,7 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, q3); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); ILVRL_H2_SH(vec1, vec0, vec2, vec3); diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c index 403e5dc51b..703fcce8a7 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, @@ -32,7 +33,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -177,7 +178,7 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, /* flat4 */ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c index 6325762a2a..9c1f5143f2 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -717,14 +717,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, } } -void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +void vpx_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, +void vpx_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h index 4063e5e6b8..49fd74c25a 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -13,144 +13,71 @@ #include 
"vpx_dsp/mips/macros_msa.h" -#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = 
__msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ } #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ { \ - v16u8 tmp, zero_in = { 0 }; \ + v16u8 tmp_flat5, zero_in = { 0 }; \ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ \ - tmp = __msa_ori_b(zero_in, 1); \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ @@ -168,7 +95,7 @@ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ \ - flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ flat2_out 
= __msa_xori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ } @@ -177,38 +104,38 @@ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ { \ - v8u16 tmp0, tmp1, tmp2; \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/libs/libvpx/vpx_dsp/mips/macros_msa.h b/libs/libvpx/vpx_dsp/mips/macros_msa.h index f498fbe9de..f9a446e7bc 100644 --- a/libs/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/macros_msa.h @@ -16,207 +16,149 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) +#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) +#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) +#define LD_SW(...) LD_V(v4i32, __VA_ARGS__) -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) 
ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) +#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) +#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) +#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + uint16_t val_lh_m = *(const uint16_t *)(psrc); \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + uint32_t val_lw_m = *(const uint32_t *)(psrc); \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + uint64_t val_ld_m = *(const uint64_t *)(psrc); \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } +#define SH(val, pdst) *(uint16_t *)(pdst) = (val); +#define SW(val, pdst) *(uint32_t *)(pdst) = (val); +#define SD(val, pdst) *(uint64_t *)(pdst) = (val); #else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ + uint16_t val_lh_m; \ + \ + __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ + \ + : [val_lh_m] "=r"(val_lh_m) \ + : [psrc_lh_m] 
"m"(*psrc_lh_m)); \ + \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ + \ + : [val_lw_m] "=r"(val_lw_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \ + \ + : [val_ld_m] "=r"(val_ld_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ + const uint16_t val_sh_m = (val); \ + \ + __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m"(*pdst_sh_m) \ + : [val_sh_m] "r"(val_sh_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ + const uint32_t val_sw_m = (val); \ + \ + __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_sw_m] "r"(val_sw_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ + uint32_t val0_sd_m, val1_sd_m; \ + \ + val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_sd_m, pdst_sd_m); \ + SW(val1_sd_m, pdst_sd_m + 4); \ } #endif // (__mips_isa_rev >= 6) @@ -283,97 +225,73 @@ SD(in3, (pdst) + 3 * stride); \ } -/* Description : Load vectors with 16 byte elements 
with stride +/* Description : Load vector elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +#define LD_V2(RTYPE, psrc, stride, out0, out1) \ { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ + out0 = LD_V(RTYPE, (psrc)); \ + out1 = LD_V(RTYPE, (psrc) + stride); \ } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) +#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) +#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) +#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ +#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) +#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) +#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) +#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ +#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) +#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ +#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) +#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ } -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) +#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) +#define LD_SH8(...) 
LD_V8(v8i16, __VA_ARGS__) -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) - -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7, out8, out9, out10, out11, out12, out13, out14, out15) \ { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ out13, out14, out15); \ } -#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) +#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) @@ -388,79 +306,35 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ } -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride +/* Description : Store vectors with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ +#define ST_V2(RTYPE, in0, in1, pdst, stride) \ { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ + ST_V(RTYPE, in0, (pdst)); \ + ST_V(RTYPE, in1, (pdst) + stride); \ } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) +#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) +#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) +#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + ST_V2(RTYPE, in0, in1, (pdst), stride); \ + ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) +#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) +#define ST_SH4(...) 
ST_V4(v8i16, __VA_ARGS__) -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) - -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) - -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ + ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) +#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -681,6 +555,7 @@ #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) +#define VSHF_B2_SH(...) 
VSHF_B2(v8i16, __VA_ARGS__) #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ out3) \ @@ -909,27 +784,42 @@ sum_m; \ }) -/* Description : Horizontal addition of 8 unsigned halfword elements - Arguments : Inputs - in (unsigned halfword vector) - Outputs - sum_m (u32 sum) - Return Type - unsigned word - Details : 8 unsigned halfword elements of input vector are added - together and the resulting integer sum is returned +/* Description : Horizontal addition of 4 unsigned word elements + Arguments : Input - in (unsigned word vector) + Output - sum_m (u32 sum) + Return Type - unsigned word (GP) + Details : 4 unsigned word elements of 'in' vector are added together and + the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ +#define HADD_UW_U32(in) \ ({ \ - v4u32 res_m; \ v2u64 res0_m, res1_m; \ uint32_t sum_m; \ \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ + res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ + res0_m += res1_m; \ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ sum_m; \ }) +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Input - in (unsigned halfword vector) + Output - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of 'in' vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + sum_m = HADD_UW_U32(res_m); \ + sum_m; \ + }) + /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -1034,6 +924,7 @@ } #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) +#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) /* Description : Interleave even byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1292,6 +1183,7 @@ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) @@ -1544,6 +1436,12 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. 
*/ +#define SRA_2V(in0, in1, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + } + #define SRA_4V(in0, in1, in2, in3, shift) \ { \ in0 = in0 >> shift; \ @@ -1699,6 +1597,25 @@ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ } +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Input - in (byte vector) + Outputs - out0, out1 (sign extended halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8)in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ + } + /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) Outputs - out0, out1 (unsigned halfword vectors) @@ -1857,8 +1774,6 @@ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ \ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ @@ -2012,20 +1927,17 @@ /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/libs/libvpx/vpx_dsp/mips/sad_mmi.c b/libs/libvpx/vpx_dsp/mips/sad_mmi.c new file mode 100644 index 0000000000..33bd3fe7f9 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/sad_mmi.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define SAD_SRC_REF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], 
%[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_REF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[src]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_REF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define SAD_SRC_AVGREF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ 
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + 
"gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. 
+#define sadMxNxK_mmi(m, n, k) \ + void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ + } + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D_mmi(m, n) \ + void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \ + } + +static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad64xN(H) \ + unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad64x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad64xN(64); +vpx_sad64xN(32); +sadMxNx4D_mmi(64, 64); +sadMxNx4D_mmi(64, 32); + +static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg64xN(H) \ + unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg64xN(64); +vpx_sad_avg64xN(32); + +static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad32xN(H) \ + unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad32x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad32xN(64); +vpx_sad32xN(32); +vpx_sad32xN(16); +sadMxNx4D_mmi(32, 64); +sadMxNx4D_mmi(32, 32); +sadMxNx4D_mmi(32, 16); + +static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg32xN(H) \ + unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg32xN(64); +vpx_sad_avg32xN(32); +vpx_sad_avg32xN(16); + +static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad16xN(H) \ + unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad16x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad16xN(32); +vpx_sad16xN(16); +vpx_sad16xN(8); +sadMxNxK_mmi(16, 16, 3); +sadMxNxK_mmi(16, 16, 8); +sadMxNxK_mmi(16, 8, 3); +sadMxNxK_mmi(16, 8, 8); +sadMxNx4D_mmi(16, 32); +sadMxNx4D_mmi(16, 16); +sadMxNx4D_mmi(16, 8); + +static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg16xN(H) \ + unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg16xN(32); +vpx_sad_avg16xN(16); +vpx_sad_avg16xN(8); + +static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad8xN(H) \ + unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad8x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad8xN(16); +vpx_sad8xN(8); +vpx_sad8xN(4); +sadMxNxK_mmi(8, 16, 3); +sadMxNxK_mmi(8, 16, 8); +sadMxNxK_mmi(8, 8, 3); +sadMxNxK_mmi(8, 8, 8); +sadMxNx4D_mmi(8, 16); +sadMxNx4D_mmi(8, 8); +sadMxNx4D_mmi(8, 4); + +static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg8xN(H) \ + unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg8xN(16); +vpx_sad_avg8xN(8); +vpx_sad_avg8xN(4); + +static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad4xN(H) \ + unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad4x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad4xN(8); +vpx_sad4xN(4); +sadMxNxK_mmi(4, 4, 3); +sadMxNxK_mmi(4, 4, 8); +sadMxNx4D_mmi(4, 8); +sadMxNx4D_mmi(4, 4); + +static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
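+      // The 4-wide macro loads only 32 bits of ref/second_pred per row and
+      // clears the upper half of the difference register (mthc1 $0) so biadd
+      // sums just the four valid bytes.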
+ SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg4xN(H) \ + unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg4xN(8); +vpx_sad_avg4xN(4); diff --git a/libs/libvpx/vpx_dsp/mips/sad_msa.c b/libs/libvpx/vpx_dsp/mips/sad_msa.c index 6455814e1b..ab681ae9f8 100644 --- a/libs/libvpx/vpx_dsp/mips/sad_msa.c +++ b/libs/libvpx/vpx_dsp/mips/sad_msa.c @@ -283,96 +283,6 @@ static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[2] = HADD_UH_U32(sad2); } -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, 
ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, uint32_t *sad_array) { @@ -623,176 +533,6 @@ static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[7] = HADD_UH_U32(sad7); } -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 
1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1030,6 +770,7 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, v8u16 sad2_1 = { 0 }; v8u16 sad3_0 = { 0 }; v8u16 sad3_1 = { 0 }; + v4u32 sad; ref0_ptr = aref_ptr[0]; ref1_ptr = aref_ptr[1]; @@ -1061,14 +802,21 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); } - sad_array[0] = HADD_UH_U32(sad0_0); - sad_array[0] += HADD_UH_U32(sad0_1); - sad_array[1] = HADD_UH_U32(sad1_0); - sad_array[1] += HADD_UH_U32(sad1_1); - sad_array[2] = HADD_UH_U32(sad2_0); - 
sad_array[2] += HADD_UH_U32(sad2_1); - sad_array[3] = HADD_UH_U32(sad3_0); - sad_array[3] += HADD_UH_U32(sad3_1); + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_UW_U32(sad); } static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -1310,20 +1058,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ - void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ - void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ @@ -1345,20 +1079,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ - void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ - void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -1436,43 +1156,31 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, // 64x64 VPX_SAD_64xHEIGHT_MSA(64); -VPX_SAD_64xHEIGHTx3_MSA(64); -VPX_SAD_64xHEIGHTx8_MSA(64); VPX_SAD_64xHEIGHTx4D_MSA(64); VPX_AVGSAD_64xHEIGHT_MSA(64); // 64x32 VPX_SAD_64xHEIGHT_MSA(32); -VPX_SAD_64xHEIGHTx3_MSA(32); -VPX_SAD_64xHEIGHTx8_MSA(32); VPX_SAD_64xHEIGHTx4D_MSA(32); VPX_AVGSAD_64xHEIGHT_MSA(32); // 32x64 VPX_SAD_32xHEIGHT_MSA(64); -VPX_SAD_32xHEIGHTx3_MSA(64); -VPX_SAD_32xHEIGHTx8_MSA(64); VPX_SAD_32xHEIGHTx4D_MSA(64); VPX_AVGSAD_32xHEIGHT_MSA(64); // 32x32 VPX_SAD_32xHEIGHT_MSA(32); -VPX_SAD_32xHEIGHTx3_MSA(32); -VPX_SAD_32xHEIGHTx8_MSA(32); VPX_SAD_32xHEIGHTx4D_MSA(32); VPX_AVGSAD_32xHEIGHT_MSA(32); // 32x16 VPX_SAD_32xHEIGHT_MSA(16); -VPX_SAD_32xHEIGHTx3_MSA(16); -VPX_SAD_32xHEIGHTx8_MSA(16); VPX_SAD_32xHEIGHTx4D_MSA(16); VPX_AVGSAD_32xHEIGHT_MSA(16); // 16x32 VPX_SAD_16xHEIGHT_MSA(32); -VPX_SAD_16xHEIGHTx3_MSA(32); -VPX_SAD_16xHEIGHTx8_MSA(32); VPX_SAD_16xHEIGHTx4D_MSA(32); VPX_AVGSAD_16xHEIGHT_MSA(32); @@ -1506,15 +1214,11 @@ VPX_AVGSAD_8xHEIGHT_MSA(8); // 8x4 VPX_SAD_8xHEIGHT_MSA(4); -VPX_SAD_8xHEIGHTx3_MSA(4); -VPX_SAD_8xHEIGHTx8_MSA(4); 
VPX_SAD_8xHEIGHTx4D_MSA(4); VPX_AVGSAD_8xHEIGHT_MSA(4); // 4x8 VPX_SAD_4xHEIGHT_MSA(8); -VPX_SAD_4xHEIGHTx3_MSA(8); -VPX_SAD_4xHEIGHTx8_MSA(8); VPX_SAD_4xHEIGHTx4D_MSA(8); VPX_AVGSAD_4xHEIGHT_MSA(8); diff --git a/libs/libvpx/vpx_dsp/mips/subtract_mmi.c b/libs/libvpx/vpx_dsp/mips/subtract_mmi.c new file mode 100644 index 0000000000..9f361704a8 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/subtract_mmi.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + double ftmp[13]; + uint32_t tmp[1]; + + if (rows == cols) { + switch (rows) { + case 4: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp2] \n\t" +#else + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" +#else + "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t" +#endif + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] 
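+          // Advance diff by one row; diff_stride is doubled in the operand
+          // list because diff holds int16_t elements.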
\n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), +#if _MIPS_SIM == _ABIO32 + [tmp0] "=&r"(tmp[0]), +#endif + [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff) + : [src_stride] "r"((mips_reg)src_stride), + [pred_stride] "r"((mips_reg)pred_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 8: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x02 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + 
MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 16: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], 
%[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 32: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + case 64: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred, + pred_stride); + } +} diff --git a/libs/libvpx/vpx_dsp/mips/sum_squares_msa.c b/libs/libvpx/vpx_dsp/mips/sum_squares_msa.c new file mode 100644 index 0000000000..d4563dc410 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/sum_squares_msa.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "./macros_msa.h" + +uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride, + int size) { + int row, col; + uint64_t ss_res = 0; + v4i32 mul0, mul1; + v2i64 res0 = { 0 }; + + if (4 == size) { + uint64_t src0, src1, src2, src3; + v8i16 diff0 = { 0 }; + v8i16 diff1 = { 0 }; + + LD4(src, src_stride, src0, src1, src2, src3); + INSERT_D2_SH(src0, src1, diff0); + INSERT_D2_SH(src2, src3, diff1); + DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (8 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (16 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (0 == (size % 16)) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + for (row = 0; row < (size >> 4); row++) { + for (col = 0; col < size; col += 16) { + const int16_t *src_ptr = src + col; + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + src_ptr += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + 
DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + } + + src += 16 * src_stride; + } + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else { + int16_t val; + + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + val = src[col]; + ss_res += val * val; + } + + src += src_stride; + } + } + + return ss_res; +} diff --git a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h index da100f6a98..f077fa4814 100644 --- a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -15,19 +15,24 @@ #define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \ \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + k0_m = __msa_fill_h(cnst0); \ + k1_m = __msa_fill_h(cnst1); \ + k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \ + k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \ + k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \ \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \ + s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \ + s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ \ - DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ } diff --git a/libs/libvpx/vpx_dsp/mips/variance_mmi.c b/libs/libvpx/vpx_dsp/mips/variance_mmi.c new file mode 100644 index 0000000000..4af60d3634 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/variance_mmi.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/asmdefs_mmi.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, + vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. 
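+   (VARIANCE_SSE_SUM_8 keeps the running sums in 16-bit halfword lanes via
+   paddh; this variant subtracts and accumulates in 32-bit word lanes with
+   psubw/paddw, so the larger 64-wide and 32x64 blocks do not wrap.)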
*/ +#define VARIANCE_SSE_SUM_8_FOR_W64 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ + "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ + "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + +#define VARIANCE_SSE_SUM_4 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + +#define VARIANCE_SSE_SUM_8 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" + +#define VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VARIANCE_SSE_16 \ + VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw 
%[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ + /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ + /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp8], 
%[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ + "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "and %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "and %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ + "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], 
%[ftmp14] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ + "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "and %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "and %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). 
+// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + 
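/* Aside: the plain-C var_filter_block2d_bil_{first,second}_pass helpers just
 * defined are the reference behaviour the MMI filter macros have to match: a
 * 2-tap pass into a 16-bit (H + 1) x W intermediate, then a second 2-tap pass
 * back down to 8 bits.  The sketch below is a self-contained scalar model of
 * that composition for a 4x4 block.  It is illustration only; FILTER_BITS == 7
 * (so the two taps sum to 128) is an assumption about the surrounding file,
 * and the helper name is hypothetical. */
#include <stdint.h>

#define BIL_FILTER_BITS 7
#define BIL_ROUND(v) (((v) + (1 << (BIL_FILTER_BITS - 1))) >> BIL_FILTER_BITS)

static void subpel_bilinear_4x4_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t xf[2], const uint8_t yf[2],
                                       uint8_t out[4 * 4]) {
  uint16_t mid[5 * 4]; /* the first pass keeps H + 1 rows for the vertical pass */
  int r, c;
  /* first pass: horizontal 2-tap filter (pixel_step == 1) */
  for (r = 0; r < 5; ++r)
    for (c = 0; c < 4; ++c)
      mid[r * 4 + c] = BIL_ROUND(src[r * src_stride + c] * xf[0] +
                                 src[r * src_stride + c + 1] * xf[1]);
  /* second pass: vertical 2-tap filter (pixel_step == output width) */
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      out[r * 4 + c] = (uint8_t)BIL_ROUND(mid[r * 4 + c] * yf[0] +
                                          mid[(r + 1) * 4 + c] * yf[1]);
}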
VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (64 * high)); +} + +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE64XN(64) +VPX_VARIANCE64XN(32) + +uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, uint32_t *sse) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "li %[tmp0], 0x40 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / 2048); +} + +static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse 
= 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (32 * high)); +} + +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE32XN(32) +VPX_VARIANCE32XN(16) + +static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + 
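/* Aside, for readers of the variance kernels in this hunk.  First, the closing
 * "li 0x20 / dsrl / paddw / swc1" sequence is a horizontal add: each 64-bit
 * accumulator holds two packed 32-bit partial results, so shifting the
 * register right by 32 and adding folds the high lane onto the low lane before
 * the low word is stored.  Second, the C return statements apply the usual
 * shortcut identity
 *     variance = SSE - (sum * sum) / N,   N = width * height,
 * which is why the 32x64 variant divides by the literal 2048 and why sum is
 * widened to int64_t (for a 64x64 block sum can reach about +/- 1.04e6, so
 * sum * sum does not fit in 32 bits).  The following is only a scalar sketch
 * of that arithmetic, with hypothetical helper names: */
#include <stdint.h>

/* Model of the dsrl-by-32 / paddw fold used to finish each accumulator. */
static uint32_t fold_hi_lo(uint64_t packed_acc) {
  return (uint32_t)(packed_acc >> 32) + (uint32_t)packed_acc;
}

static uint32_t variance_ref_sketch(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride, int w,
                                    int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse_lanes = 0; /* two packed 32-bit partial SSE sums, as in the asm */
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      sum += d;
      /* alternate columns between the low and high lane; for these block
       * sizes each lane stays well below 2^32, so no carry is lost */
      sse_lanes += (uint64_t)(d * d) << ((c & 1) ? 32 : 0);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = fold_hi_lo(sse_lanes);
  return (uint32_t)(*sse - (uint64_t)((sum * sum) / (w * h)));
}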
"dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (16 * high)); +} + +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE16XN(32) +VPX_VARIANCE16XN(16) +VPX_VARIANCE16XN(8) + +static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (8 * high)); +} + +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t 
*sse) { \ + return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE8XN(16) +VPX_VARIANCE8XN(8) +VPX_VARIANCE8XN(4) + +static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_4 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (4 * high)); +} + +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE4XN(8) +VPX_VARIANCE4XN(4) + +static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_16 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, 
int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse16xN(16); +vpx_mse16xN(8); + +static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse8xN(16); +vpx_mse8xN(8); + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ + } + +SUBPIX_VAR(64, 64) +SUBPIX_VAR(64, 32) +SUBPIX_VAR(32, 64) +SUBPIX_VAR(32, 32) +SUBPIX_VAR(32, 16) +SUBPIX_VAR(16, 32) + +static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[15] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + // temp2: temp2[0] ~ temp2[15] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + // temp2+16*1: temp2[0] ~ temp2[15] + MMI_ADDIU(%[temp2_ptr], 
%[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[16 * H]; \ + var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ + } + +SUBPIX_VAR16XN(16) +SUBPIX_VAR16XN(8) + +static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[7] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + // temp2+8*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] 
"=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[8 * H]; \ + var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ + } + +SUBPIX_VAR8XN(16) +SUBPIX_VAR8XN(8) +SUBPIX_VAR8XN(4) + +static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[7]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp6]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + // fdata3: fdata3[0] ~ fdata3[3] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + // temp2+4*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int 
b_stride, uint32_t *sse) { \ + uint8_t temp2[4 * H]; \ + var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ + } + +SUBPIX_VAR4XN(8) +SUBPIX_VAR4XN(4) + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ + } + +SUBPIX_AVG_VAR(64, 64) +SUBPIX_AVG_VAR(64, 32) +SUBPIX_AVG_VAR(32, 64) +SUBPIX_AVG_VAR(32, 32) +SUBPIX_AVG_VAR(32, 16) +SUBPIX_AVG_VAR(16, 32) +SUBPIX_AVG_VAR(16, 16) +SUBPIX_AVG_VAR(16, 8) +SUBPIX_AVG_VAR(8, 16) +SUBPIX_AVG_VAR(8, 8) +SUBPIX_AVG_VAR(8, 4) +SUBPIX_AVG_VAR(4, 8) +SUBPIX_AVG_VAR(4, 4) diff --git a/libs/libvpx/vpx_dsp/mips/variance_msa.c b/libs/libvpx/vpx_dsp/mips/variance_msa.c index 085990e484..49b2f99230 100644 --- a/libs/libvpx/vpx_dsp/mips/variance_msa.c +++ b/libs/libvpx/vpx_dsp/mips/variance_msa.c @@ -489,27 +489,19 @@ static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride) { - uint32_t err = 0; uint32_t src0, src1, src2, src3; uint32_t ref0, ref1, ref2, ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; - v16u8 src_vec0, src_vec1; - v8i16 diff0, diff1; v4i32 err0 = { 0 }; - v4i32 err1 = { 0 }; LW4(src_ptr, src_stride, src0, src1, src2, src3); LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); INSERT_W4_SB(src0, src1, src2, src3, src); INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); - ILVRL_B2_UB(src, ref, src_vec0, src_vec1); - HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); - DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); - err = HADD_SW_S32(err0); - err += HADD_SW_S32(err1); + CALC_MSE_B(src, ref, err0); - return err; + return HADD_SW_S32(err0); } #define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index ad2af28669..187a013421 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 dst0 = { 0 }, res; v16u8 mask0, mask1, mask2, mask3; v8i16 filt, res0, res1; @@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - 
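/* Aside: the two averaging paths touched around here reduce to the same
 * per-pixel operation.  SUBPIX_AVG_VAR above averages the filtered prediction
 * with second_pred via vpx_comp_avg_pred_c before measuring variance, and the
 * rewritten MSA "and_aver_dst" kernels below average the convolution result
 * with the bytes already in dst, replacing the old interleave/AVER_UB2_UB
 * sequence with a single averaging step on a packed dst vector.  In both cases
 * the average is the rounded one, (p + q + 1) >> 1; reading aver_u.b as a
 * rounding average is an assumption here, the scalar sketch below is not. */
#include <stdint.h>

/* Rounded byte average, the scalar core of comp-avg prediction. */
static uint8_t avg_round(uint8_t p, uint8_t q) {
  return (uint8_t)((p + q + 1) >> 1);
}

/* Sketch of a comp-avg pass over a W x H block, in the spirit of the
 * vpx_comp_avg_pred_c(comp, pred, W, H, ref, ref_stride) call above;
 * the function name here is hypothetical. */
static void comp_avg_sketch(uint8_t *comp, const uint8_t *pred, int w, int h,
                            const uint8_t *ref, int ref_stride) {
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) comp[c] = avg_round(pred[c], ref[c]);
    comp += w;
    pred += w;
    ref += ref_stride;
  }
}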
PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, vec0, vec1, vec2, vec3; mask0 = LD_UB(&mc_filt_mask_arr[16]); @@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, LD_SB4(src, src_stride, src0, src1, src2, src3); XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, vec0, vec1); LD_SB4(src, src_stride, src0, src1, src2, src3); @@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, res3); ILVR_D2_UB(res1, res0, res3, res2, res0, res2); XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); ST4x8_UB(res0, res2, dst, dst_stride); } @@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, out0, out1, out2, out3; mask0 = LD_UB(&mc_filt_mask_arr[0]); @@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } @@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; v8u16 vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, 
src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 vec4, vec5, vec6, vec7, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, @@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } static void common_hz_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t 
*dst, int32_t dst_stride, int8_t *filter, int32_t height) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } } @@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, 
src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 1cfa63201c..5187cea21c 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; @@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); @@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out5 = hz_out9; @@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; @@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + 
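/* Aside: the recurring change in these MSA hunks is that the destination rows
 * are no longer fetched with LD_UB4/LD_UB8 (four or eight full 16-byte vector
 * loads), but with LW4/LD4 scalar loads inserted into one or two vectors via
 * INSERT_W4_UB/INSERT_D2_UB.  The apparent intent is to touch only the 4 or 8
 * bytes per row that belong to the block (narrow rows no longer read past
 * their end) and to halve the number of dst registers; that reading of the
 * helper macros is an assumption.  A scalar model of the 4-wide case: */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Gather four 4-byte dst rows into one 16-byte lane-packed buffer, touching
 * only the 16 bytes of the 4x4 block (LW4 + INSERT_W4_UB in the asm).  The
 * 8-wide kernels do the same with 64-bit loads into two buffers. */
static void load_dst_4x4_sketch(const uint8_t *dst, ptrdiff_t stride,
                                uint8_t packed[16]) {
  uint32_t row;
  int r;
  for (r = 0; r < 4; ++r) {
    memcpy(&row, dst + r * stride, sizeof(row)); /* LW: one 32-bit load   */
    memcpy(packed + 4 * r, &row, sizeof(row));   /* INSERT_W: word lane r */
  }
}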
INSERT_D2_UB(tp2, tp3, dst1); hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); @@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa( static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; + v16u8 dst0 = { 0 }, out; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( hz_out3, hz_out5, 8); hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, 
res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST4x8_UB(res0, res1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4w_msa( @@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa( static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( LD_SB5(src, src_stride, src0, src1, src2, src3, src4); src += (5 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); @@ -357,16 +362,16 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( tmp3 = __msa_dotp_u_h(vec0, filt_vt); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa( void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, 
- filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 146ce3b2f5..ef8c901140 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; + v16u8 dst0 = { 0 }, out; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; v16i8 src10998, filt0, filt1, filt2, filt3; @@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); @@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, SRARI_H2_SH(out10, out32, FILTER_BITS); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); out = __msa_aver_u_b(out, dst0); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; @@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); @@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t 
*src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16u8 dst0 = { 0 }, out, filt0, src2110, src4332; v16i8 src10_r, src32_r, src21_r, src43_r; v8i16 filt; v8u16 tmp0, tmp1; @@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, src4 = LD_SB(src); src += src_stride; - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); @@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + uint32_t tp0, tp1, tp2, tp3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; v16u8 src2110, src4332, src6554, src8776, filt0; @@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src += (8 * src_stride); src8 = LD_SB(src); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, @@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); + ST4x8_UB(src2110, src4332, dst, dst_stride); } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h(filt, 0); LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, 
tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_vt_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( for (loop_cnt = (height >> 3); loop_cnt--;) { LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst2); + INSERT_D2_UB(tp2, tp3, dst3); ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, vec3); @@ -378,15 +387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 9e8bf7b519..152dc26104 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -621,9 +621,10 @@ static void 
common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index b16ec57886..d35a5a7a63 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } } + +static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; + v16i8 shf2 = shf1 + 2; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 
filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(x_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + + XORI_B4_128_SB(out0, out1, out2, out3); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3, out4, out5, out6, out7; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + v8i16 dst0_h, dst1_h, dst2_h, dst3_h; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, 
srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src4); + INSERT_D2_UB(srcd2, srcd3, src5); + LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src6); + INSERT_D2_UB(srcd2, srcd3, src7); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + XORI_B4_128_SB(out0, out1, out2, out3); + + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out4, out5); + ILVRL_W2_SB(tmp3, tmp1, out6, out7); + XORI_B4_128_SB(out4, out5, out6, out7); + + dst0_h = src0_h * filt0; + dst1_h = src4_h * filt4; + dst0_h += src1_h * filt1; + dst1_h += src5_h * filt5; + dst0_h += src2_h * filt2; + dst1_h += src6_h * filt6; + dst0_h += src3_h * filt3; + dst1_h += src7_h * filt7; + + UNPCK_SB_SH(out4, src0_h, src1_h); + UNPCK_SB_SH(out5, src2_h, src3_h); + UNPCK_SB_SH(out6, src4_h, src5_h); + UNPCK_SB_SH(out7, src6_h, src7_h); + + dst2_h = src0_h * filt0; + dst3_h = src4_h * filt4; + dst2_h += src1_h * filt1; + dst3_h += src5_h * filt5; + dst2_h += src2_h * filt2; + dst3_h += src6_h * filt6; + dst2_h += src3_h * filt3; + dst3_h += src7_h * filt7; + + ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); + SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); + SAT_SH2_SH(dst0_h, dst2_h, 7); + dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); + ST_UB(dst0, dst); +} + +static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0; + v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + in0 = LD_UB(src); + out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); +} + +static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + + LD_UB4(src, 16, in0, in1, in2, in3); + VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_UB(tmp2, tmp0, out0, out1); + ILVRL_W2_UB(tmp3, tmp1, out2, out3); + ST8x4_UB(out0, out1, dst, dst_stride); + ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); +} + +static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; + v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; + v16u8 out9, out10, out11, out12, out13, out14, out15; + + LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out0, out1, out2, out3, + out4, out5, out6, out7); + ST_UB8(out0, out1, out2, out3, 
out4, out5, out6, out7, dst, dst_stride); + dst += 8 * dst_stride; + + SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); + SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); + SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); + SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out8, out9, out10, out11, + out12, out13, out14, out15); + ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); + } else { + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + + x_q4 += x_step_q4; + } + + transpose4x4_to_dst(temp, dst, dst_stride); + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); + } else { + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose8x8_to_dst(temp, dst, dst_stride); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); + int x, y, z, i; + + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 16x16 areas. The intermediate height is not always + // a multiple of 16, so force it to be a multiple of 8 here. 
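+ // For example, h == 20 gives y == 20 + (16 - 4) == 32 and h == 32 gives y == 48, so the loop below always covers whole 16-row blocks.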
+ y = h + (16 - (h & 0xF)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 16) { + for (z = 0; z < 16; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); + } else { + for (i = 0; i < 16; ++i) { + temp[z * 16 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose16x16_to_dst(temp, dst + x, dst_stride); + } + + src += src_stride * 16; + dst += dst_stride * 16; + } while (y -= 16); +} + +static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + v16i8 shf2 = shf1 + 8; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h; + v8i16 filt0, filt1, filt2, filt3; + + LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); + LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); + INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); + INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(y_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 dst0; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src0); + INSERT_D2_SB(srcd2, srcd3, src1); + LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src2); + INSERT_D2_SB(srcd2, srcd3, src3); + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + XORI_B4_128_SB(src0, src1, src2, src3); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, 
src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter, + int w) { + int x; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + for (x = 0; x < w; x += 16) { + LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); + src_y += 16; + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(src4, src5, src6, src7); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + UNPCK_SB_SH(src4, src8_h, src9_h); + UNPCK_SB_SH(src5, src10_h, src11_h); + UNPCK_SB_SH(src6, src12_h, src13_h); + UNPCK_SB_SH(src7, src14_h, src15_h); + + src0_h *= filt0; + src1_h *= filt0; + src8_h *= filt4; + src9_h *= filt4; + src0_h += src2_h * filt1; + src1_h += src3_h * filt1; + src8_h += src10_h * filt5; + src9_h += src11_h * filt5; + src0_h += src4_h * filt2; + src1_h += src5_h * filt2; + src8_h += src12_h * filt6; + src9_h += src13_h * filt6; + src0_h += src6_h * filt3; + src1_h += src7_h * filt3; + src8_h += src14_h * filt7; + src9_h += src15_h * filt7; + + ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); + SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); + SAT_SH2_SH(src0_h, src1_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src1_h); + ST_UB(dst0, dst); + dst += 16; + } +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint32_t srcd = LW(src_y + 3 * src_stride); + SW(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint64_t srcd = LD(src_y + 3 * src_stride); + SD(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + int y_q4 = y0_q4; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + 
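// The sub-pixel phase is non-zero, so this row needs the full 8-tap vertical filter. +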
filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + for (x = 0; x < w; ++x) { + dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; + } + } + + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { + vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + if (w >= 16) { + scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + w, intermediate_height); + } else if (w == 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } + } +} diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 410682271f..13fce0077c 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, 
filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c index 45399bad85..ce649935da 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, @@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c index c3d87a4ab8..c2ab33a2f4 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -9,6 +9,7 @@ */ #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, @@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index 198c21ed20..d53244596b 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -16,18 +16,18 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + 
tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ }) #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ @@ -110,15 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; ST_UB(tmp_m, (pdst)); \ } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } #endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h new file mode 100644 index 0000000000..2c5d9a4f6a --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + int32x4_t u = vec_vsx_ld(c, s); + int32x4_t v = vec_vsx_ld(c, s + 4); + return vec_packs(u, v); +#else + return vec_vsx_ld(c, s); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + const int16x8_t one = vec_splat_s16(1); + const int32x4_t even = vec_mule(v, one); + const int32x4_t odd = vec_mulo(v, one); + const int32x4_t high = vec_mergeh(even, odd); + const int32x4_t low = vec_mergel(even, odd); + vec_vsx_st(high, c, s); + vec_vsx_st(low, c, s + 4); +#else + vec_vsx_st(v, c, s); +#endif +} + +#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/libs/libvpx/vpx_dsp/ppc/hadamard_vsx.c new file mode 100644 index 0000000000..e279b30478 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/hadamard_vsx.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) { + const int16x8_t b0 = vec_add(v[0], v[1]); + const int16x8_t b1 = vec_sub(v[0], v[1]); + const int16x8_t b2 = vec_add(v[2], v[3]); + const int16x8_t b3 = vec_sub(v[2], v[3]); + const int16x8_t b4 = vec_add(v[4], v[5]); + const int16x8_t b5 = vec_sub(v[4], v[5]); + const int16x8_t b6 = vec_add(v[6], v[7]); + const int16x8_t b7 = vec_sub(v[6], v[7]); + + const int16x8_t c0 = vec_add(b0, b2); + const int16x8_t c1 = vec_add(b1, b3); + const int16x8_t c2 = vec_sub(b0, b2); + const int16x8_t c3 = vec_sub(b1, b3); + const int16x8_t c4 = vec_add(b4, b6); + const int16x8_t c5 = vec_add(b5, b7); + const int16x8_t c6 = vec_sub(b4, b6); + const int16x8_t c7 = vec_sub(b5, b7); + + v[0] = vec_add(c0, c4); + v[1] = vec_sub(c2, c6); + v[2] = vec_sub(c0, c4); + v[3] = vec_add(c2, c6); + v[4] = vec_add(c3, c7); + v[5] = vec_sub(c3, c7); + v[6] = vec_sub(c1, c5); + v[7] = vec_add(c1, c5); +} + +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t v[8]; + + v[0] = vec_vsx_ld(0, src_diff); + v[1] = vec_vsx_ld(0, src_diff + src_stride); + v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride)); + v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride)); + v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride)); + v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride)); + v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride)); + v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride)); + + vpx_hadamard_s16_8x8_one_pass(v); + + vpx_transpose_s16_8x8(v); + + vpx_hadamard_s16_8x8_one_pass(v); + + store_tran_low(v[0], 0, coeff); + store_tran_low(v[1], 0, coeff + 8); + store_tran_low(v[2], 0, coeff + 16); + store_tran_low(v[3], 0, coeff + 24); + store_tran_low(v[4], 0, coeff + 32); + store_tran_low(v[5], 0, coeff + 40); + store_tran_low(v[6], 0, coeff + 48); + store_tran_low(v[7], 0, coeff + 56); +} + +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + const uint16x8_t ones = vec_splat_u16(1); + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff); + /* Top right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + /* Overlay the 8x8 blocks and combine. */ + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low(0, coeff); + const int16x8_t a1 = load_tran_low(0, coeff + 64); + const int16x8_t a2 = load_tran_low(0, coeff + 128); + const int16x8_t a3 = load_tran_low(0, coeff + 192); + + /* Prevent the result from escaping int16_t. 
*/ + const int16x8_t b0 = vec_sra(a0, ones); + const int16x8_t b1 = vec_sra(a1, ones); + const int16x8_t b2 = vec_sra(a2, ones); + const int16x8_t b3 = vec_sra(a3, ones); + + const int16x8_t c0 = vec_add(b0, b1); + const int16x8_t c2 = vec_add(b2, b3); + const int16x8_t c1 = vec_sub(b0, b1); + const int16x8_t c3 = vec_sub(b2, b3); + + const int16x8_t d0 = vec_add(c0, c2); + const int16x8_t d1 = vec_add(c1, c3); + const int16x8_t d2 = vec_sub(c0, c2); + const int16x8_t d3 = vec_sub(c1, c3); + + store_tran_low(d0, 0, coeff); + store_tran_low(d1, 0, coeff + 64); + store_tran_low(d2, 0, coeff + 128); + store_tran_low(d3, 0, coeff + 192); + + coeff += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c new file mode 100644 index 0000000000..6273460f19 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -0,0 +1,749 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, above); + int i; + (void)left; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(d, 0, dst); + } +} + +void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, above); + const uint8x16_t d1 = vec_vsx_ld(16, above); + int i; + (void)left; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(d0, 0, dst); + vec_vsx_st(d1, 16, dst); + } +} + +static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + +void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + (void)above; + + vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); +} + +void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + (void)above; + + vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + 
vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); +} + +void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + const uint8x16_t v8 = vec_splat(d, 8); + const uint8x16_t v9 = vec_splat(d, 9); + const uint8x16_t v10 = vec_splat(d, 10); + const uint8x16_t v11 = vec_splat(d, 11); + + const uint8x16_t v12 = vec_splat(d, 12); + const uint8x16_t v13 = vec_splat(d, 13); + const uint8x16_t v14 = vec_splat(d, 14); + const uint8x16_t v15 = vec_splat(d, 15); + + (void)above; + + vec_vsx_st(v0, 0, dst); + dst += stride; + vec_vsx_st(v1, 0, dst); + dst += stride; + vec_vsx_st(v2, 0, dst); + dst += stride; + vec_vsx_st(v3, 0, dst); + dst += stride; + vec_vsx_st(v4, 0, dst); + dst += stride; + vec_vsx_st(v5, 0, dst); + dst += stride; + vec_vsx_st(v6, 0, dst); + dst += stride; + vec_vsx_st(v7, 0, dst); + dst += stride; + vec_vsx_st(v8, 0, dst); + dst += stride; + vec_vsx_st(v9, 0, dst); + dst += stride; + vec_vsx_st(v10, 0, dst); + dst += stride; + vec_vsx_st(v11, 0, dst); + dst += stride; + vec_vsx_st(v12, 0, dst); + dst += stride; + vec_vsx_st(v13, 0, dst); + dst += stride; + vec_vsx_st(v14, 0, dst); + dst += stride; + vec_vsx_st(v15, 0, dst); +} + +#define H_PREDICTOR_32(v) \ + vec_vsx_st(v, 0, dst); \ + vec_vsx_st(v, 16, dst); \ + dst += stride + +void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, left); + const uint8x16_t d1 = vec_vsx_ld(16, left); + + const uint8x16_t v0_0 = vec_splat(d0, 0); + const uint8x16_t v1_0 = vec_splat(d0, 1); + const uint8x16_t v2_0 = vec_splat(d0, 2); + const uint8x16_t v3_0 = vec_splat(d0, 3); + const uint8x16_t v4_0 = vec_splat(d0, 4); + const uint8x16_t v5_0 = vec_splat(d0, 5); + const uint8x16_t v6_0 = vec_splat(d0, 6); + const uint8x16_t v7_0 = vec_splat(d0, 7); + const uint8x16_t v8_0 = vec_splat(d0, 8); + const uint8x16_t v9_0 = vec_splat(d0, 9); + const uint8x16_t v10_0 = vec_splat(d0, 10); + const uint8x16_t v11_0 = vec_splat(d0, 11); + const uint8x16_t v12_0 = vec_splat(d0, 12); + const uint8x16_t v13_0 = vec_splat(d0, 13); + const uint8x16_t v14_0 = vec_splat(d0, 14); + const uint8x16_t v15_0 = vec_splat(d0, 15); + + const uint8x16_t v0_1 = vec_splat(d1, 0); + const uint8x16_t v1_1 = vec_splat(d1, 1); + const uint8x16_t v2_1 = vec_splat(d1, 2); + const uint8x16_t v3_1 = vec_splat(d1, 3); + const uint8x16_t v4_1 = vec_splat(d1, 4); + const uint8x16_t v5_1 = vec_splat(d1, 5); + const uint8x16_t v6_1 = vec_splat(d1, 6); + const uint8x16_t v7_1 = vec_splat(d1, 7); + const uint8x16_t v8_1 = vec_splat(d1, 8); + const uint8x16_t v9_1 = vec_splat(d1, 9); + const uint8x16_t v10_1 = vec_splat(d1, 10); + const uint8x16_t v11_1 = vec_splat(d1, 11); + const uint8x16_t v12_1 = vec_splat(d1, 12); + const uint8x16_t v13_1 = vec_splat(d1, 13); + const uint8x16_t v14_1 = vec_splat(d1, 14); + const uint8x16_t v15_1 = vec_splat(d1, 
15); + + (void)above; + + H_PREDICTOR_32(v0_0); + H_PREDICTOR_32(v1_0); + H_PREDICTOR_32(v2_0); + H_PREDICTOR_32(v3_0); + + H_PREDICTOR_32(v4_0); + H_PREDICTOR_32(v5_0); + H_PREDICTOR_32(v6_0); + H_PREDICTOR_32(v7_0); + + H_PREDICTOR_32(v8_0); + H_PREDICTOR_32(v9_0); + H_PREDICTOR_32(v10_0); + H_PREDICTOR_32(v11_0); + + H_PREDICTOR_32(v12_0); + H_PREDICTOR_32(v13_0); + H_PREDICTOR_32(v14_0); + H_PREDICTOR_32(v15_0); + + H_PREDICTOR_32(v0_1); + H_PREDICTOR_32(v1_1); + H_PREDICTOR_32(v2_1); + H_PREDICTOR_32(v3_1); + + H_PREDICTOR_32(v4_1); + H_PREDICTOR_32(v5_1); + H_PREDICTOR_32(v6_1); + H_PREDICTOR_32(v7_1); + + H_PREDICTOR_32(v8_1); + H_PREDICTOR_32(v9_1); + H_PREDICTOR_32(v10_1); + H_PREDICTOR_32(v11_1); + + H_PREDICTOR_32(v12_1); + H_PREDICTOR_32(v13_1); + H_PREDICTOR_32(v14_1); + H_PREDICTOR_32(v15_1); +} + +void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + uint8x16_t d; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); +} + +void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 4), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 5), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 6), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 7), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); +} + +static void 
tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, + int16x8_t ah, int16x8_t al, int16x8_t tl) { + int16x8_t vh, vl, ls; + + ls = vec_splat(l, 0); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 1); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 2); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 3); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 4); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 5); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 6); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 7); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); +} + +void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l = vec_vsx_ld(0, left); + const int16x8_t lh = unpack_to_s16_h(l); + const int16x8_t ll = unpack_to_s16_l(l); + const uint8x16_t a = vec_vsx_ld(0, above); + const int16x8_t ah = unpack_to_s16_h(a); + const int16x8_t al = unpack_to_s16_l(a); + + tm_predictor_16x8(dst, stride, lh, ah, al, tl); + + dst += stride * 8; + + tm_predictor_16x8(dst, stride, ll, ah, al, tl); +} + +static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls, + const int16x8_t a0h, const int16x8_t a0l, + const int16x8_t a1h, const int16x8_t a1l, + const int16x8_t tl) { + int16x8_t vh, vl; + + vh = vec_sub(vec_add(ls, a0h), tl); + vl = vec_sub(vec_add(ls, a0l), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + vh = vec_sub(vec_add(ls, a1h), tl); + vl = vec_sub(vec_add(ls, a1l), tl); + vec_vsx_st(vec_packsu(vh, vl), 16, dst); +} + +static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride, + const int16x8_t l, const uint8x16_t a0, + const uint8x16_t a1, const int16x8_t tl) { + const int16x8_t a0h = unpack_to_s16_h(a0); + const int16x8_t a0l = unpack_to_s16_l(a0); + const int16x8_t a1h = unpack_to_s16_h(a1); + const int16x8_t a1l = unpack_to_s16_l(a1); + + tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl); +} + +void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + 
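// TM prediction: each output pixel is left + above - top_left, saturated to 8 bits by vec_packsu. +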
const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl); +} + +static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 8; i++, dst += stride) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(val, d, 1), 0, dst); + } +} + +static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + } +} + +void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_16x16(dst, stride, v128); +} + +static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + vec_vsx_st(val, 16, dst); + } +} + +void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_32x32(dst, stride, v128); +} + +static uint8x16_t avg16(const uint8_t *values) { + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_16x16(dst, stride, avg16(left)); +} + +void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_16x16(dst, stride, avg16(above)); +} + +static uint8x16_t avg32(const uint8_t *values) { + const uint8x16_t v0 = vec_vsx_ld(0, values); + const uint8x16_t v1 = vec_vsx_ld(16, values); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_32x32(dst, stride, avg32(left)); +} + +void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_32x32(dst, stride, avg32(above)); +} + +static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { + 
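// DC value for the 8x8 predictor: (sum of the 8 above and 8 left pixels + 8) >> 4, splatted across the vector. +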
const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); +} + +void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left)); +} + +static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5)); + const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0))); + const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left)); +} + +static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, + const uint8x16_t c) { + const uint8x16_t ac = + vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1))); + + return vec_avg(ac, b); +} + +// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken. 
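+// With vec_perm, sl1 picks bytes 1..15 of the first operand followed by byte 0 of the second, i.e. the concatenated pair shifted left by one byte.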
+static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; + +void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 7); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(row, d, 1), 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(a, 15); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row, 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 15); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0 = avg3(a0, b0, c0); + uint8x16_t row1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 32; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 16, dst); + dst += stride; + row0 = vec_perm(row0, row1, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 9); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a, b); + uint8x16_t row1 = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 4; i++) { + const uint8x16_t d0 = vec_vsx_ld(0, dst); + const uint8x16_t d1 = vec_vsx_ld(0, dst + stride); + vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst); + vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 0); + const uint8x16_t b = vec_perm(a0, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a0, b); + uint8x16_t row1 = avg3(a0, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, 
const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t a2 = vec_vsx_ld(32, above); + const uint8x16_t above_right = vec_splat(a2, 0); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0_0 = vec_avg(a0, b0); + uint8x16_t row0_1 = vec_avg(a1, b1); + uint8x16_t row1_0 = avg3(a0, b0, c0); + uint8x16_t row1_1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row0_0, 0, dst); + vec_vsx_st(row0_1, 16, dst); + vec_vsx_st(row1_0, 0, dst + stride); + vec_vsx_st(row1_1, 16, dst + stride); + dst += stride * 2; + row0_0 = vec_perm(row0_0, row0_1, sl1); + row0_1 = vec_perm(row0_1, above_right, sl1); + row1_0 = vec_perm(row1_0, row1_1, sl1); + row1_1 = vec_perm(row1_1, above_right, sl1); + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c new file mode 100644 index 0000000000..d43a9fd184 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -0,0 +1,1063 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; 
+static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; +static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; +static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; +static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; +static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; +static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; +static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; +static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; +static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; +static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; +static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; +static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; +static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +#define ROUND_SHIFT_INIT \ + const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ + const uint32x4_t shift14 = vec_splat_u32(14); + +#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14); + +#define PIXEL_ADD_INIT \ + int16x8_t add8 = vec_splat_s16(8); \ + uint16x8_t shift4 = vec_splat_u16(4); + +#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4); + +#define IDCT4(in0, in1, out0, out1) \ + t0 = vec_add(in0, in1); \ + t1 = vec_sub(in0, in1); \ + tmp16_0 = vec_mergeh(t0, t1); \ + temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \ + temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \ + \ + tmp16_0 = vec_mergel(in0, in1); \ + temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp3); \ + temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \ + DCT_CONST_ROUND_SHIFT(temp4); \ + \ + step0 = vec_packs(temp1, temp2); \ + step1 = vec_packs(temp4, temp3); \ + out0 = vec_add(step0, step1); \ + out1 = vec_sub(step0, step1); \ + out1 = vec_perm(out1, out1, mask0); + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; + int16x8_t v0 = load_tran_low(0, input); + int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t t0 = vec_mergeh(v0, v1); + int16x8_t t1 = vec_mergel(v0, v1); + + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; + 
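+  // The cospi*_v tables above hold round(2^14 * cos(k * pi / 64)) (e.g.
+  // cospi16_v = 11585), replicated across all lanes so the constants can be
+  // fed straight to vec_mule()/vec_mulo(). Each butterfly keeps its products
+  // in 32 bits and DCT_CONST_ROUND_SHIFT() brings them back to transform
+  // precision with (x + 2^13) >> 14. After the two IDCT4 passes below,
+  // PIXEL_ADD4() applies the final output rounding (x + 8) >> 4, and the sums
+  // with the destination pixels are saturated back to 8 bits by vec_packsu().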
uint8_t tmp_dest[16]; + ROUND_SHIFT_INIT + PIXEL_ADD_INIT; + + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + + IDCT4(v0, v1, t_out0, t_out1); + // transpose + t0 = vec_mergeh(t_out0, t_out1); + t1 = vec_mergel(t_out0, t_out1); + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + IDCT4(v0, v1, t_out0, t_out1); + + PIXEL_ADD4(v0, t_out0); + PIXEL_ADD4(v1, t_out1); + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); + output_v = vec_packsu(tmp16_0, tmp16_1); + + vec_vsx_st(output_v, 0, tmp_dest); + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; +} + +#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + out0 = vec_mergeh(in0, in1); \ + out1 = vec_mergel(in0, in1); \ + out2 = vec_mergeh(in2, in3); \ + out3 = vec_mergel(in2, in3); \ + out4 = vec_mergeh(in4, in5); \ + out5 = vec_mergel(in4, in5); \ + out6 = vec_mergeh(in6, in7); \ + out7 = vec_mergel(in6, in7); \ + in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \ + in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \ + in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \ + in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \ + in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \ + in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \ + in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \ + in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \ + out0 = vec_perm(in0, in4, tr8_mask0); \ + out1 = vec_perm(in0, in4, tr8_mask1); \ + out2 = vec_perm(in1, in5, tr8_mask0); \ + out3 = vec_perm(in1, in5, tr8_mask1); \ + out4 = vec_perm(in2, in6, tr8_mask0); \ + out5 = vec_perm(in2, in6, tr8_mask1); \ + out6 = vec_perm(in3, in7, tr8_mask0); \ + out7 = vec_perm(in3, in7, tr8_mask1); + +/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_2 = vec_sub(inpt0, inpt1); \ + tmp16_3 = vec_add(inpt0, inpt1); \ + tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \ + tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_mulo(tmp16_0, cospi); \ + temp11 = vec_mulo(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \ + /* stage 1 */ \ + step0 = in0; \ + step2 = in4; \ + step1 = in2; \ + step3 = in6; \ + \ + STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in3, step5, step6, cospi12_v, 
cospi20_v); \ + \ + /* stage 2 */ \ + STEP8_1(step0, step2, in1, in0, cospi16_v); \ + STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(step4, step5); \ + in5 = vec_sub(step4, step5); \ + in6 = vec_sub(step7, step6); \ + in7 = vec_add(step6, step7); \ + \ + /* stage 3 */ \ + step0 = vec_add(in0, in3); \ + step1 = vec_add(in1, in2); \ + step2 = vec_sub(in1, in2); \ + step3 = vec_sub(in0, in3); \ + step4 = in4; \ + STEP8_1(in6, in5, step5, step6, cospi16_v); \ + step7 = in7; \ + \ + /* stage 4 */ \ + in0 = vec_add(step0, step7); \ + in1 = vec_add(step1, step6); \ + in2 = vec_add(step2, step5); \ + in3 = vec_add(step3, step4); \ + in4 = vec_sub(step3, step4); \ + in5 = vec_sub(step2, step5); \ + in6 = vec_sub(step1, step6); \ + in7 = vec_sub(step0, step7); + +#define PIXEL_ADD(in, out, add, shiftx) \ + out = vec_add(vec_sra(vec_add(in, add), shiftx), out); + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11; + int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; + int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, + tmp16_2, tmp16_3; + int16x8_t src0 = load_tran_low(0, input); + int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t src2 = load_tran_low(16 * sizeof(*input), input); + int16x8_t src3 = load_tran_low(24 * sizeof(*input), input); + int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); + int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); + int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); + int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest); + uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); + uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); + uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov); + int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov); + int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov); + int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); + uint16x8_t shift5 = vec_splat_u16(5); + uint8x16_t output0, output1, output2, output3; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, tmp6, tmp7); + + IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, + src3, src4, src5, src6, src7); + IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); + PIXEL_ADD(src0, d_u0, add, shift5); + PIXEL_ADD(src1, d_u1, add, shift5); + PIXEL_ADD(src2, d_u2, add, shift5); + PIXEL_ADD(src3, d_u3, add, shift5); + PIXEL_ADD(src4, d_u4, add, shift5); + PIXEL_ADD(src5, d_u5, add, shift5); + PIXEL_ADD(src6, d_u6, 
add, shift5); + PIXEL_ADD(src7, d_u7, add, shift5); + output0 = vec_packsu(d_u0, d_u1); + output1 = vec_packsu(d_u2, d_u3); + output2 = vec_packsu(d_u4, d_u5); + output3 = vec_packsu(d_u6, d_u7); + + vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest); + vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest); + vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest); + vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); +} + +#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ + in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ + in0 = load(offset, source); \ + in1 = load((step) + (offset), source); \ + in2 = load(2 * (step) + (offset), source); \ + in3 = load(3 * (step) + (offset), source); \ + in4 = load(4 * (step) + (offset), source); \ + in5 = load(5 * (step) + (offset), source); \ + in6 = load(6 * (step) + (offset), source); \ + in7 = load(7 * (step) + (offset), source); \ + in8 = load(8 * (step) + (offset), source); \ + in9 = load(9 * (step) + (offset), source); \ + inA = load(10 * (step) + (offset), source); \ + inB = load(11 * (step) + (offset), source); \ + inC = load(12 * (step) + (offset), source); \ + inD = load(13 * (step) + (offset), source); \ + inE = load(14 * (step) + (offset), source); \ + inF = load(15 * (step) + (offset), source); + +#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + temp20 = vec_mulo(tmp16_0, cospi); \ + temp21 = vec_mulo(tmp16_1, cospi); \ + temp30 = vec_sub(temp10, temp20); \ + temp10 = vec_add(temp10, temp20); \ + temp20 = vec_sub(temp11, temp21); \ + temp21 = vec_add(temp11, temp21); \ + DCT_CONST_ROUND_SHIFT(temp30); \ + DCT_CONST_ROUND_SHIFT(temp20); \ + outpt0 = vec_packs(temp30, temp20); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp21); \ + outpt1 = vec_packs(temp10, temp21); + +#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \ + inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, outA, outB, outC, outD, outE, outF) \ + /* stage 1 */ \ + /* out0 = in0; */ \ + out1 = in8; \ + out2 = in4; \ + out3 = inC; \ + out4 = in2; \ + out5 = inA; \ + out6 = in6; \ + out7 = inE; \ + out8 = in1; \ + out9 = in9; \ + outA = in5; \ + outB = inD; \ + outC = in3; \ + outD = inB; \ + outE = in7; \ + outF = inF; \ + \ + /* stage 2 */ \ + /* in0 = out0; */ \ + in1 = out1; \ + in2 = out2; \ + in3 = out3; \ + in4 = out4; \ + in5 = out5; \ + in6 = out6; \ + in7 = out7; \ + \ + STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \ + STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \ + STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \ + STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \ + \ + /* stage 3 */ \ + out0 = in0; \ + out1 = in1; \ + out2 = in2; \ + out3 = in3; \ + \ + STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \ + \ + out8 = vec_add(in8, in9); \ + out9 = vec_sub(in8, in9); \ + outA = vec_sub(inB, inA); \ + outB = vec_add(inA, inB); \ + outC = vec_add(inC, inD); \ + outD = vec_sub(inC, inD); \ + outE = vec_sub(inF, inE); \ + outF = vec_add(inE, inF); \ + \ + /* stage 4 */ \ 
+ STEP16_1(out0, out1, in1, in0, cospi16_v); \ + STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(out4, out5); \ + in5 = vec_sub(out4, out5); \ + in6 = vec_sub(out7, out6); \ + in7 = vec_add(out6, out7); \ + \ + in8 = out8; \ + inF = outF; \ + tmp16_0 = vec_mergeh(out9, outE); \ + tmp16_1 = vec_mergel(out9, outE); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + in9 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inE = vec_packs(temp10, temp11); \ + \ + tmp16_0 = vec_mergeh(outA, outD); \ + tmp16_1 = vec_mergel(outA, outD); \ + temp10 = \ + vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = \ + vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inA = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inD = vec_packs(temp10, temp11); \ + \ + inB = outB; \ + inC = outC; \ + \ + /* stage 5 */ \ + out0 = vec_add(in0, in3); \ + out1 = vec_add(in1, in2); \ + out2 = vec_sub(in1, in2); \ + out3 = vec_sub(in0, in3); \ + out4 = in4; \ + STEP16_1(in6, in5, out5, out6, cospi16_v); \ + out7 = in7; \ + \ + out8 = vec_add(in8, inB); \ + out9 = vec_add(in9, inA); \ + outA = vec_sub(in9, inA); \ + outB = vec_sub(in8, inB); \ + outC = vec_sub(inF, inC); \ + outD = vec_sub(inE, inD); \ + outE = vec_add(inD, inE); \ + outF = vec_add(inC, inF); \ + \ + /* stage 6 */ \ + in0 = vec_add(out0, out7); \ + in1 = vec_add(out1, out6); \ + in2 = vec_add(out2, out5); \ + in3 = vec_add(out3, out4); \ + in4 = vec_sub(out3, out4); \ + in5 = vec_sub(out2, out5); \ + in6 = vec_sub(out1, out6); \ + in7 = vec_sub(out0, out7); \ + in8 = out8; \ + in9 = out9; \ + STEP16_1(outD, outA, inA, inD, cospi16_v); \ + STEP16_1(outC, outB, inB, inC, cospi16_v); \ + inE = outE; \ + inF = outF; \ + \ + /* stage 7 */ \ + out0 = vec_add(in0, inF); \ + out1 = vec_add(in1, inE); \ + out2 = vec_add(in2, inD); \ + out3 = vec_add(in3, inC); \ + out4 = vec_add(in4, inB); \ + out5 = vec_add(in5, inA); \ + out6 = vec_add(in6, in9); \ + out7 = vec_add(in7, in8); \ + out8 = vec_sub(in7, in8); \ + out9 = vec_sub(in6, in9); \ + outA = vec_sub(in5, inA); \ + outB = vec_sub(in4, inB); \ + outC = vec_sub(in3, inC); \ + outD = vec_sub(in2, inD); \ + outE = vec_sub(in1, inE); \ + outF = vec_sub(in0, inF); + +#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); + +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, + src11, src12, src13, src14, src15, src16, src17; + int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, + 
src31, src32, src33, src34, src35, src36, src37; + int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; + int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; + uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, + dest9, destA, destB, destC, destD, destE, destF; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + ROUND_SHIFT_INIT; + + // transform rows + // load and transform the upper half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, + src11, src02, src12, src03, src13, src04, src14, src05, src15, + src06, src16, src07, src17); + TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, + tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, + src04, src05, src06, src07, src10, src11, src12, src13, src14, src15, + src16, src17); + TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + + // load and transform the lower half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, + src23, src33, src24, src34, src25, src35, src26, src36, src27, + src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, + src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, + src36, src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + + // transform columns + // left half first + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, + tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, + src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, + src26, src27); + // right half + IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, + src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, + src36, src37); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, + dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD, + destE, destF); + + PIXEL_ADD_STORE16(src00, src10, dest0, 0); + PIXEL_ADD_STORE16(src01, src11, dest1, stride); + PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); + 
PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); + PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); + PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); + PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); + PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); + + PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); + PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); + PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); + PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); + PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); + PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); + PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); + PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); +} + +#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ + in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \ + in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \ + in71, in72, in73, offset) \ + /* load the first row from the 8x32 block*/ \ + in00 = load(offset, input); \ + in01 = load(offset + 16, input); \ + in02 = load(offset + 2 * 16, input); \ + in03 = load(offset + 3 * 16, input); \ + \ + in10 = load(offset + 4 * 16, input); \ + in11 = load(offset + 5 * 16, input); \ + in12 = load(offset + 6 * 16, input); \ + in13 = load(offset + 7 * 16, input); \ + \ + in20 = load(offset + 8 * 16, input); \ + in21 = load(offset + 9 * 16, input); \ + in22 = load(offset + 10 * 16, input); \ + in23 = load(offset + 11 * 16, input); \ + \ + in30 = load(offset + 12 * 16, input); \ + in31 = load(offset + 13 * 16, input); \ + in32 = load(offset + 14 * 16, input); \ + in33 = load(offset + 15 * 16, input); \ + \ + in40 = load(offset + 16 * 16, input); \ + in41 = load(offset + 17 * 16, input); \ + in42 = load(offset + 18 * 16, input); \ + in43 = load(offset + 19 * 16, input); \ + \ + in50 = load(offset + 20 * 16, input); \ + in51 = load(offset + 21 * 16, input); \ + in52 = load(offset + 22 * 16, input); \ + in53 = load(offset + 23 * 16, input); \ + \ + in60 = load(offset + 24 * 16, input); \ + in61 = load(offset + 25 * 16, input); \ + in62 = load(offset + 26 * 16, input); \ + in63 = load(offset + 27 * 16, input); \ + \ + /* load the last row from the 8x32 block*/ \ + in70 = load(offset + 28 * 16, input); \ + in71 = load(offset + 29 * 16, input); \ + in72 = load(offset + 30 * 16, input); \ + in73 = load(offset + 31 * 16, input); + +/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z + * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ + temp11 = 
vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT32(in0, in1, in2, in3, out) \ + \ + /* stage 1 */ \ + /* out[0][0] = in[0][0]; */ \ + out[0][1] = in2[0]; \ + out[0][2] = in1[0]; \ + out[0][3] = in3[0]; \ + out[0][4] = in0[4]; \ + out[0][5] = in2[4]; \ + out[0][6] = in1[4]; \ + out[0][7] = in3[4]; \ + out[1][0] = in0[2]; \ + out[1][1] = in2[2]; \ + out[1][2] = in1[2]; \ + out[1][3] = in3[2]; \ + out[1][4] = in0[6]; \ + out[1][5] = in2[6]; \ + out[1][6] = in1[6]; \ + out[1][7] = in3[6]; \ + \ + STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ + STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ + STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ + STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ + STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ + STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ + STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ + STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ + \ + /* stage 2 */ \ + /* in0[0] = out[0][0]; */ \ + in0[1] = out[0][1]; \ + in0[2] = out[0][2]; \ + in0[3] = out[0][3]; \ + in0[4] = out[0][4]; \ + in0[5] = out[0][5]; \ + in0[6] = out[0][6]; \ + in0[7] = out[0][7]; \ + \ + STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ + STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ + STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ + STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ + \ + in2[0] = vec_add(out[2][0], out[2][1]); \ + in2[1] = vec_sub(out[2][0], out[2][1]); \ + in2[2] = vec_sub(out[2][3], out[2][2]); \ + in2[3] = vec_add(out[2][3], out[2][2]); \ + in2[4] = vec_add(out[2][4], out[2][5]); \ + in2[5] = vec_sub(out[2][4], out[2][5]); \ + in2[6] = vec_sub(out[2][7], out[2][6]); \ + in2[7] = vec_add(out[2][7], out[2][6]); \ + in3[0] = vec_add(out[3][0], out[3][1]); \ + in3[1] = vec_sub(out[3][0], out[3][1]); \ + in3[2] = vec_sub(out[3][3], out[3][2]); \ + in3[3] = vec_add(out[3][3], out[3][2]); \ + in3[4] = vec_add(out[3][4], out[3][5]); \ + in3[5] = vec_sub(out[3][4], out[3][5]); \ + in3[6] = vec_sub(out[3][7], out[3][6]); \ + in3[7] = vec_add(out[3][6], out[3][7]); \ + \ + /* stage 3 */ \ + out[0][0] = in0[0]; \ + out[0][1] = in0[1]; \ + out[0][2] = in0[2]; \ + out[0][3] = in0[3]; \ + \ + STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ + STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ + \ + out[1][0] = vec_add(in1[0], in1[1]); \ + out[1][1] = vec_sub(in1[0], in1[1]); \ + out[1][2] = vec_sub(in1[3], in1[2]); \ + out[1][3] = vec_add(in1[2], in1[3]); \ + out[1][4] = vec_add(in1[4], in1[5]); \ + out[1][5] = vec_sub(in1[4], in1[5]); \ + out[1][6] = vec_sub(in1[7], in1[6]); \ + out[1][7] = vec_add(in1[6], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[3][7] = in3[7]; \ + STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ + STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ + cospi4m_v); \ + out[2][3] = in2[3]; \ + out[2][4] = 
in2[4]; \ + STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ + STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ + cospi20m_v); \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][3] = in3[3]; \ + out[3][4] = in3[4]; \ + \ + /* stage 4 */ \ + STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ + STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ + in0[4] = vec_add(out[0][4], out[0][5]); \ + in0[5] = vec_sub(out[0][4], out[0][5]); \ + in0[6] = vec_sub(out[0][7], out[0][6]); \ + in0[7] = vec_add(out[0][7], out[0][6]); \ + \ + in1[0] = out[1][0]; \ + in1[7] = out[1][7]; \ + STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ + STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ + cospi8m_v); \ + in1[3] = out[1][3]; \ + in1[4] = out[1][4]; \ + \ + in2[0] = vec_add(out[2][0], out[2][3]); \ + in2[1] = vec_add(out[2][1], out[2][2]); \ + in2[2] = vec_sub(out[2][1], out[2][2]); \ + in2[3] = vec_sub(out[2][0], out[2][3]); \ + in2[4] = vec_sub(out[2][7], out[2][4]); \ + in2[5] = vec_sub(out[2][6], out[2][5]); \ + in2[6] = vec_add(out[2][5], out[2][6]); \ + in2[7] = vec_add(out[2][4], out[2][7]); \ + \ + in3[0] = vec_add(out[3][0], out[3][3]); \ + in3[1] = vec_add(out[3][1], out[3][2]); \ + in3[2] = vec_sub(out[3][1], out[3][2]); \ + in3[3] = vec_sub(out[3][0], out[3][3]); \ + in3[4] = vec_sub(out[3][7], out[3][4]); \ + in3[5] = vec_sub(out[3][6], out[3][5]); \ + in3[6] = vec_add(out[3][5], out[3][6]); \ + in3[7] = vec_add(out[3][4], out[3][7]); \ + \ + /* stage 5 */ \ + out[0][0] = vec_add(in0[0], in0[3]); \ + out[0][1] = vec_add(in0[1], in0[2]); \ + out[0][2] = vec_sub(in0[1], in0[2]); \ + out[0][3] = vec_sub(in0[0], in0[3]); \ + out[0][4] = in0[4]; \ + STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ + out[0][7] = in0[7]; \ + \ + out[1][0] = vec_add(in1[0], in1[3]); \ + out[1][1] = vec_add(in1[1], in1[2]); \ + out[1][2] = vec_sub(in1[1], in1[2]); \ + out[1][3] = vec_sub(in1[0], in1[3]); \ + out[1][4] = vec_sub(in1[7], in1[4]); \ + out[1][5] = vec_sub(in1[6], in1[5]); \ + out[1][6] = vec_add(in1[5], in1[6]); \ + out[1][7] = vec_add(in1[4], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ + STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ + STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ + cospi8m_v); \ + STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ + cospi8m_v); \ + out[2][6] = in2[6]; \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][1] = in3[1]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* stage 6 */ \ + in0[0] = vec_add(out[0][0], out[0][7]); \ + in0[1] = vec_add(out[0][1], out[0][6]); \ + in0[2] = vec_add(out[0][2], out[0][5]); \ + in0[3] = vec_add(out[0][3], out[0][4]); \ + in0[4] = vec_sub(out[0][3], out[0][4]); \ + in0[5] = vec_sub(out[0][2], out[0][5]); \ + in0[6] = vec_sub(out[0][1], out[0][6]); \ + in0[7] = vec_sub(out[0][0], out[0][7]); \ + in1[0] = out[1][0]; \ + in1[1] = out[1][1]; \ + STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ + STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ + in1[6] = out[1][6]; \ + in1[7] = out[1][7]; \ + \ + in2[0] = vec_add(out[2][0], out[2][7]); \ + in2[1] = vec_add(out[2][1], out[2][6]); \ + in2[2] = vec_add(out[2][2], out[2][5]); \ + in2[3] = vec_add(out[2][3], out[2][4]); \ + in2[4] = vec_sub(out[2][3], out[2][4]); \ + in2[5] = 
vec_sub(out[2][2], out[2][5]); \ + in2[6] = vec_sub(out[2][1], out[2][6]); \ + in2[7] = vec_sub(out[2][0], out[2][7]); \ + \ + in3[0] = vec_sub(out[3][7], out[3][0]); \ + in3[1] = vec_sub(out[3][6], out[3][1]); \ + in3[2] = vec_sub(out[3][5], out[3][2]); \ + in3[3] = vec_sub(out[3][4], out[3][3]); \ + in3[4] = vec_add(out[3][4], out[3][3]); \ + in3[5] = vec_add(out[3][5], out[3][2]); \ + in3[6] = vec_add(out[3][6], out[3][1]); \ + in3[7] = vec_add(out[3][7], out[3][0]); \ + \ + /* stage 7 */ \ + out[0][0] = vec_add(in0[0], in1[7]); \ + out[0][1] = vec_add(in0[1], in1[6]); \ + out[0][2] = vec_add(in0[2], in1[5]); \ + out[0][3] = vec_add(in0[3], in1[4]); \ + out[0][4] = vec_add(in0[4], in1[3]); \ + out[0][5] = vec_add(in0[5], in1[2]); \ + out[0][6] = vec_add(in0[6], in1[1]); \ + out[0][7] = vec_add(in0[7], in1[0]); \ + out[1][0] = vec_sub(in0[7], in1[0]); \ + out[1][1] = vec_sub(in0[6], in1[1]); \ + out[1][2] = vec_sub(in0[5], in1[2]); \ + out[1][3] = vec_sub(in0[4], in1[3]); \ + out[1][4] = vec_sub(in0[3], in1[4]); \ + out[1][5] = vec_sub(in0[2], in1[5]); \ + out[1][6] = vec_sub(in0[1], in1[6]); \ + out[1][7] = vec_sub(in0[0], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + out[2][2] = in2[2]; \ + out[2][3] = in2[3]; \ + STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ + STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ + STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ + STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ + out[3][4] = in3[4]; \ + out[3][5] = in3[5]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* final */ \ + in0[0] = vec_add(out[0][0], out[3][7]); \ + in0[1] = vec_add(out[0][1], out[3][6]); \ + in0[2] = vec_add(out[0][2], out[3][5]); \ + in0[3] = vec_add(out[0][3], out[3][4]); \ + in0[4] = vec_add(out[0][4], out[3][3]); \ + in0[5] = vec_add(out[0][5], out[3][2]); \ + in0[6] = vec_add(out[0][6], out[3][1]); \ + in0[7] = vec_add(out[0][7], out[3][0]); \ + in1[0] = vec_add(out[1][0], out[2][7]); \ + in1[1] = vec_add(out[1][1], out[2][6]); \ + in1[2] = vec_add(out[1][2], out[2][5]); \ + in1[3] = vec_add(out[1][3], out[2][4]); \ + in1[4] = vec_add(out[1][4], out[2][3]); \ + in1[5] = vec_add(out[1][5], out[2][2]); \ + in1[6] = vec_add(out[1][6], out[2][1]); \ + in1[7] = vec_add(out[1][7], out[2][0]); \ + in2[0] = vec_sub(out[1][7], out[2][0]); \ + in2[1] = vec_sub(out[1][6], out[2][1]); \ + in2[2] = vec_sub(out[1][5], out[2][2]); \ + in2[3] = vec_sub(out[1][4], out[2][3]); \ + in2[4] = vec_sub(out[1][3], out[2][4]); \ + in2[5] = vec_sub(out[1][2], out[2][5]); \ + in2[6] = vec_sub(out[1][1], out[2][6]); \ + in2[7] = vec_sub(out[1][0], out[2][7]); \ + in3[0] = vec_sub(out[0][7], out[3][0]); \ + in3[1] = vec_sub(out[0][6], out[3][1]); \ + in3[2] = vec_sub(out[0][5], out[3][2]); \ + in3[3] = vec_sub(out[0][4], out[3][3]); \ + in3[4] = vec_sub(out[0][3], out[3][4]); \ + in3[5] = vec_sub(out[0][2], out[3][5]); \ + in3[6] = vec_sub(out[0][1], out[3][6]); \ + in3[7] = vec_sub(out[0][0], out[3][7]); + +// NOT A FULL TRANSPOSE! 
Transposes just each 8x8 block in each row, +// does not transpose rows +#define TRANSPOSE_8x32(in, out) \ + /* transpose 4 of 8x8 blocks */ \ + TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \ + in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \ + out[0][4], out[0][5], out[0][6], out[0][7]); \ + TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \ + in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \ + out[1][4], out[1][5], out[1][6], out[1][7]); \ + TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \ + in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \ + out[2][4], out[2][5], out[2][6], out[2][7]); \ + TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \ + in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \ + out[3][4], out[3][5], out[3][6], out[3][7]); + +#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \ + dst = vec_vsx_ld((step)*stride, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \ + dst = vec_vsx_ld((step)*stride + 16, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in2, d_uh, add, shift6); \ + PIXEL_ADD(in3, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); + +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7); + +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8]; + int16x8_t tmp16_0, tmp16_1; + int32x4_t temp10, temp11, temp20, temp21, temp30; + uint8x16_t dst; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + + ROUND_SHIFT_INIT; + + LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0], + src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2], + src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3], + src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4], + src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5], + src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7], + src0[1][7], src0[2][7], src0[3][7], 0); + // Rows + // transpose the first row of 8x8 blocks + TRANSPOSE_8x32(src0, tmp); + // transform the 32x8 column + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0); + TRANSPOSE_8x32(tmp, src0); + + LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0], + src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2], + src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3], + src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4], + src1[3][4], src1[0][5], src1[1][5], src1[2][5], 
src1[3][5], + src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7], + src1[1][7], src1[2][7], src1[3][7], 512); + TRANSPOSE_8x32(src1, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1); + TRANSPOSE_8x32(tmp, src1); + + LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0], + src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2], + src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3], + src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4], + src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5], + src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7], + src2[1][7], src2[2][7], src2[3][7], 1024); + TRANSPOSE_8x32(src2, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2); + TRANSPOSE_8x32(tmp, src2); + + LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0], + src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2], + src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3], + src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4], + src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5], + src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7], + src3[1][7], src3[2][7], src3[3][7], 1536); + TRANSPOSE_8x32(src3, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3); + TRANSPOSE_8x32(tmp, src3); + + // Columns + IDCT32(src0[0], src1[0], src2[0], src3[0], tmp); + IDCT32(src0[1], src1[1], src2[1], src3[1], tmp); + IDCT32(src0[2], src1[2], src2[2], src3[2], tmp); + IDCT32(src0[3], src1[3], src2[3], src3[3], tmp); + + ADD_STORE_BLOCK(src0, 0); + ADD_STORE_BLOCK(src1, 8); + ADD_STORE_BLOCK(src2, 16); + ADD_STORE_BLOCK(src3, 24); +} diff --git a/libs/libvpx/vpx_dsp/ppc/sad_vsx.c b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c new file mode 100644 index 0000000000..bb49addae1 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/types_vsx.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_ah = unpack_to_s16_h(v_a); \ + v_al = unpack_to_s16_l(v_a); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_ah, v_bh); \ + v_subl = vec_sub(v_al, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define SAD16(height) \ + unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +#define SAD32(height) \ + unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + PROCESS16(16); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +#define SAD64(height) \ + unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + PROCESS16(16); \ + PROCESS16(32); \ + PROCESS16(48); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +SAD16(8); +SAD16(16); +SAD16(32); +SAD32(16); +SAD32(32); +SAD32(64); +SAD64(32); +SAD64(64); + +#define SAD16AVG(height) \ + unsigned int vpx_sad16x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ + ref_stride); \ + \ + return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \ + } + +#define SAD32AVG(height) \ + unsigned int vpx_sad32x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ + ref_stride); \ + \ + return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \ + } + +#define SAD64AVG(height) \ + unsigned int vpx_sad64x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ + ref_stride); \ + return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ + } + +SAD16AVG(8); +SAD16AVG(16); +SAD16AVG(32); 
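+// Each *_avg_vsx function scores a compound prediction: vpx_comp_avg_pred_vsx()
+// first averages second_pred with the reference block (rounding to nearest),
+// and the regular vpx_sadNxM_vsx() kernel is then run against that aligned
+// temporary buffer.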
+SAD32AVG(16); +SAD32AVG(32); +SAD32AVG(64); +SAD64AVG(32); +SAD64AVG(64); + +#define PROCESS16_4D(offset, ref, v_h, v_l) \ + v_b = vec_vsx_ld(offset, ref); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_h, v_bh); \ + v_subl = vec_sub(v_l, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define UNPACK_SRC(offset, srcv_h, srcv_l) \ + v_a = vec_vsx_ld(offset, src); \ + srcv_h = unpack_to_s16_h(v_a); \ + srcv_l = unpack_to_s16_l(v_a); + +#define SAD16_4D(height) \ + void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah, v_al); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD32_4D(height) \ + void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD64_4D(height) \ + void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_ah3, v_al3, v_ah4, v_al4; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \ + UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \ + PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +SAD16_4D(8); +SAD16_4D(16); +SAD16_4D(32); +SAD32_4D(16); +SAD32_4D(32); +SAD32_4D(64); +SAD64_4D(32); +SAD64_4D(64); diff --git a/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h new file mode 100644 index 0000000000..f02556d522 --- /dev/null +++ 
b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#define VPX_DSP_PPC_TRANSPOSE_VSX_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) { + // d = vec_mergeh(a,b): + // The even elements of the result are obtained left-to-right, + // from the high elements of a. + // The odd elements of the result are obtained left-to-right, + // from the high elements of b. + // + // d = vec_mergel(a,b): + // The even elements of the result are obtained left-to-right, + // from the low elements of a. + // The odd elements of the result are obtained left-to-right, + // from the low elements of b. + + // Example, starting with: + // v[0]: 00 01 02 03 04 05 06 07 + // v[1]: 10 11 12 13 14 15 16 17 + // v[2]: 20 21 22 23 24 25 26 27 + // v[3]: 30 31 32 33 34 35 36 37 + // v[4]: 40 41 42 43 44 45 46 47 + // v[5]: 50 51 52 53 54 55 56 57 + // v[6]: 60 61 62 63 64 65 66 67 + // v[7]: 70 71 72 73 74 75 76 77 + + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + b0 = vec_mergeh(v[0], v[4]); + b1 = vec_mergel(v[0], v[4]); + b2 = vec_mergeh(v[1], v[5]); + b3 = vec_mergel(v[1], v[5]); + b4 = vec_mergeh(v[2], v[6]); + b5 = vec_mergel(v[2], v[6]); + b6 = vec_mergeh(v[3], v[7]); + b7 = vec_mergel(v[3], v[7]); + + // After first merge operation + // b0: 00 40 01 41 02 42 03 43 + // b1: 04 44 05 45 06 46 07 47 + // b2: 10 50 11 51 12 52 13 53 + // b3: 14 54 15 55 16 56 17 57 + // b4: 20 60 21 61 22 62 23 63 + // b5: 24 64 25 65 26 66 27 67 + // b6: 30 70 31 71 32 62 33 73 + // b7: 34 74 35 75 36 76 37 77 + + c0 = vec_mergeh(b0, b4); + c1 = vec_mergel(b0, b4); + c2 = vec_mergeh(b1, b5); + c3 = vec_mergel(b1, b5); + c4 = vec_mergeh(b2, b6); + c5 = vec_mergel(b2, b6); + c6 = vec_mergeh(b3, b7); + c7 = vec_mergel(b3, b7); + + // After second merge operation + // c0: 00 20 40 60 01 21 41 61 + // c1: 02 22 42 62 03 23 43 63 + // c2: 04 24 44 64 05 25 45 65 + // c3: 06 26 46 66 07 27 47 67 + // c4: 10 30 50 70 11 31 51 71 + // c5: 12 32 52 72 13 33 53 73 + // c6: 14 34 54 74 15 35 55 75 + // c7: 16 36 56 76 17 37 57 77 + + v[0] = vec_mergeh(c0, c4); + v[1] = vec_mergel(c0, c4); + v[2] = vec_mergeh(c1, c5); + v[3] = vec_mergel(c1, c5); + v[4] = vec_mergeh(c2, c6); + v[5] = vec_mergel(c2, c6); + v[6] = vec_mergeh(c3, c7); + v[7] = vec_mergel(c3, c7); + + // After last merge operation + // v[0]: 00 10 20 30 40 50 60 70 + // v[1]: 01 11 21 31 41 51 61 71 + // v[2]: 02 12 22 32 42 52 62 72 + // v[3]: 03 13 23 33 43 53 63 73 + // v[4]: 04 14 24 34 44 54 64 74 + // v[5]: 05 15 25 35 45 55 65 75 + // v[6]: 06 16 26 36 46 56 66 76 + // v[7]: 07 17 27 37 47 57 67 77 +} + +#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/types_vsx.h b/libs/libvpx/vpx_dsp/ppc/types_vsx.h new file mode 100644 index 0000000000..f611d02d2d --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/types_vsx.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_PPC_TYPES_VSX_H_ +#define VPX_DSP_PPC_TYPES_VSX_H_ + +#include + +typedef vector signed char int8x16_t; +typedef vector unsigned char uint8x16_t; +typedef vector signed short int16x8_t; +typedef vector unsigned short uint16x8_t; +typedef vector signed int int32x4_t; +typedef vector unsigned int uint32x4_t; + +#ifdef __clang__ +static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm) +#elif defined(__GNUC__) && \ + (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3)) +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif + +#ifdef WORDS_BIGENDIAN +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif +#else +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3) +#endif +#endif + +#endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/variance_vsx.c b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c new file mode 100644 index 0000000000..1efe2f0056 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
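types_vsx.h gives the rest of the PPC code NEON-style typedefs over the AltiVec/VSX vector types from <altivec.h>, an xxpermdi() wrapper that falls back to vec_perm() permute tables on clang, and the unpack_to_* helpers. The unpack macros widen one half of a 16-byte vector of unsigned bytes to eight 16-bit lanes by interleaving with a zero vector; on little-endian targets the merge operands are swapped so the zero byte still ends up in the high half of each lane. A scalar picture of one such widen (an illustrative sketch only):

    #include <stdint.h>

    /* What unpack_to_s16_h()/unpack_to_s16_l() compute for one half of a
     * vector: zero-extend eight unsigned bytes to signed 16-bit lanes. */
    static void widen_u8_to_s16(const uint8_t *half8, int16_t out[8]) {
      int i;
      for (i = 0; i < 8; i++) out[i] = (int16_t)half8[i]; /* always >= 0 */
    }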
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static inline uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) { + int distortion; + + const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); + const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); + const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride)); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); + const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3); + + vec_ste(d, 0, &distortion); + + return distortion; +} + +// TODO(lu_zero): Unroll +uint32_t vpx_get_mb_ss_vsx(const int16_t *a) { + unsigned int i, sum = 0; + int32x4_t s = vec_splat_s32(0); + + for (i = 0; i < 256; i += 8) { + const int16x8_t v = vec_vsx_ld(0, a + i); + s = vec_msum(v, v, s); + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)s, 0, &sum); + + return sum; +} + +void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width >= 16) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref)); + vec_vsx_st(v, j, comp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + // Process 2 lines at time + for (i = 0; i < height / 2; ++i) { + const uint8x16_t r0 = vec_vsx_ld(0, ref); + const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride); + const uint8x16_t r = xxpermdi(r0, r1, 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 2; + pred += 16; // width * 2; + ref += ref_stride * 2; + } + } else { + assert(width == 4); + // process 4 lines at time + for (i = 0; i < height / 4; ++i) { + const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref); + const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride); + const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2); + const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3); + const uint8x16_t r = + (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 4; + pred += 16; // width * 4; + ref += ref_stride * 4; + } + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c new file mode 100644 index 0000000000..5c3ba4576f --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
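vec_avg() rounds, so the averaging code in variance_vsx.c matches the existing C reference vpx_comp_avg_pred_c(): every output pixel is (pred + ref + 1) >> 1, with the width-8 and width-4 paths packing two or four rows into a single 16-byte store (hence the += 16 pointer bumps). A minimal scalar equivalent for comparison (editorial sketch, not code from the tree):

    #include <stdint.h>

    /* Scalar sketch of the per-byte rounding average done by vec_avg(). */
    static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                                  int width, int height, const uint8_t *ref,
                                  int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j)
          comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
        comp_pred += width;
        pred += width;
        ref += ref_stride;
      }
    }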
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// TODO(lu_zero): unroll +static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 16: { + copy_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int i; + for (i = h; i--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} + +static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + vec_vsx_st(v, 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst)); + const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + vec_vsx_st(v2, 32, dst); + vec_vsx_st(v3, 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + switch (w) { + case 16: { + avg_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_w32(src, src_stride, dst, dst_stride, h); + break; + } 
+ case 64: { + avg_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +static inline void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { + const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); + const int32x4_t bias = + vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); + const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS)); + const uint8x16_t v = vec_splat( + vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3); + vec_ste(v, 0, dst); +} + +static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, + const int16_t *const x_filter) { + const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); + const int16x8_t f = vec_vsx_ld(0, x_filter); + + convolve_line(dst, s, f); +} + +// TODO(lu_zero): Implement 8x8 and bigger block special cases +static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + uint8_t v; + convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, + uint8x16_t c, uint8x16_t d, + uint8x16_t e, uint8x16_t f, + uint8x16_t g, uint8x16_t h) { + uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b); + uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d); + uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f); + uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h); + + uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd); + uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh); + + return (uint8x16_t)vec_mergeh(abcd, efgh); +} + +static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { + uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); + uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); + uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); + uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride); + uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride); + uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride); + uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride); + uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride); + const int16x8_t f = vec_vsx_ld(0, y_filter); + uint8_t buf[16]; + const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7); + + vec_vsx_st(s, 0, buf); + + convolve_line(dst, unpack_to_s16_h(s), f); +} + +static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * 
(SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + convolve_line_v(dst + y * dst_stride, + &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + uint8_t v; + convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
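To make the 135-row bound concrete (an editorial check, assuming SUBPEL_BITS == 4 and SUBPEL_TAPS == 8 from vpx_filter.h): the worst case allowed by the asserts below is h = 64, y_step_q4 = 32 and the largest sub-pel phase y0_q4 = 15, so the expression used for intermediate_height evaluates to ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134; rounding the division up instead of truncating gives the 135 quoted above, so the 64 * 135 buffer always has a row to spare.

    #include <assert.h>

    /* Worst-case intermediate height for the two-pass filter, per the
     * derivation in the comment above (SUBPEL_BITS = 4, SUBPEL_TAPS = 8). */
    static int max_intermediate_height(void) {
      const int h = 64, y_step_q4 = 32, y0_q4 = 15;
      const int rows = (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;
      assert(rows <= 135);
      return rows; /* 134 */
    }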
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/libs/libvpx/vpx_dsp/prob.h b/libs/libvpx/vpx_dsp/prob.h index 3127a00bb9..f1cc0eaa10 100644 --- a/libs/libvpx/vpx_dsp/prob.h +++ b/libs/libvpx/vpx_dsp/prob.h @@ -11,6 +11,8 @@ #ifndef VPX_DSP_PROB_H_ #define VPX_DSP_PROB_H_ +#include + #include "./vpx_config.h" #include "./vpx_dsp_common.h" @@ -43,17 +45,20 @@ typedef int8_t vpx_tree_index; typedef const vpx_tree_index vpx_tree[]; -static INLINE vpx_prob clip_prob(int p) { - return (p > 255) ? 255 : (p < 1) ? 1 : p; -} - static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) { - if (den == 0) return 128u; - return clip_prob((int)(((int64_t)num * 256 + (den >> 1)) / den)); + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 
1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (vpx_prob)clipped_prob; + } } static INLINE vpx_prob get_binary_prob(unsigned int n0, unsigned int n1) { - return get_prob(n0, n0 + n1); + const unsigned int den = n0 + n1; + if (den == 0) return 128u; + return get_prob(n0, den); } /* This function assumes prob1 and prob2 are already within [1,255] range. */ diff --git a/libs/libvpx/vpx_dsp/quantize.c b/libs/libvpx/vpx_dsp/quantize.c index 3c7f9832f7..e37ca92ad4 100644 --- a/libs/libvpx/vpx_dsp/quantize.c +++ b/libs/libvpx/vpx_dsp/quantize.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" @@ -123,40 +125,40 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
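A short aside on the branchless clamp introduced in get_prob() at the top of this hunk: it leans on the final cast to the 8-bit vpx_prob type and on signed right shifts being arithmetic, which holds for the compilers libvpx targets. (255 - p) >> 23 is all-ones whenever p exceeds 255 and zero for 0 <= p <= 255, so the OR plus the cast saturates oversized values to 255, while the (p == 0) term lifts zero up to 1; everything in [1, 255] passes through untouched, matching the ternary spelled out in the comment. As a stand-alone helper (illustrative equivalent, not code from the tree):

    /* Branchless clamp of p to [1, 255], mirroring the expression in get_prob(). */
    static unsigned char clip_prob_branchless(int p) {
      return (unsigned char)(p | ((255 - p) >> 23) | (p == 0));
    }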
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } } *eob_ptr = eob + 1; @@ -174,39 +176,38 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
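The quantization step used in these loops, q = ((((x * quant) >> 16) + x) * quant_shift) >> 16 with x = abs_coeff + round (clamped to 16-bit range), is a two-stage Q16 multiply: up to truncation it behaves like a single multiplication by (quant + 65536) * quant_shift / 2^32. A small trace with hypothetical operand values (not taken from the codec's quantization tables) shows the scaling:

    #include <stdint.h>

    /* The two-stage Q16 multiply from vpx_quantize_b_c(), isolated.
     * Hypothetical example: x = 1000, quant = 20000, quant_shift = 32767
     *   stage 1: ((1000 * 20000) >> 16) + 1000 = 305 + 1000 = 1305
     *   stage 2: (1305 * 32767) >> 16          = 652
     */
    static int quantize_step(int16_t x, int16_t quant, int16_t quant_shift) {
      const int tmp = ((x * quant) >> 16) + x;
      return (tmp * quant_shift) >> 16;
    }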
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; @@ -228,41 +229,40 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (tmp) eob = idx_arr[i]; - } + if (tmp) eob = idx_arr[i]; } *eob_ptr = eob + 1; } @@ -282,38 +282,35 @@ void vpx_highbd_quantize_b_32x32_c( int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } diff --git a/libs/libvpx/vpx_dsp/sad.c b/libs/libvpx/vpx_dsp/sad.c index c80ef729bf..18b6dc6e09 100644 --- a/libs/libvpx/vpx_dsp/sad.c +++ b/libs/libvpx/vpx_dsp/sad.c @@ -39,7 +39,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ return sad(src, src_stride, comp_pred, m, m, n); \ } @@ -70,8 +70,6 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, /* clang-format off */ // 64x64 sadMxN(64, 64) -sadMxNxK(64, 64, 3) -sadMxNxK(64, 64, 8) sadMxNx4D(64, 64) // 64x32 @@ -84,8 +82,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 3) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -122,12 +118,10 @@ sadMxNx4D(8, 8) // 8x4 sadMxN(8, 4) -sadMxNxK(8, 4, 8) sadMxNx4D(8, 4) // 4x8 sadMxN(4, 8) -sadMxNxK(4, 8, 8) sadMxNx4D(4, 8) // 4x4 @@ -178,22 +172,11 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ + DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNxK(m, n, k) \ - void vpx_highbd_sad##m##x##n##x##k##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref_array, \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - &ref_array[i], ref_stride); \ - } \ - } - #define highbd_sadMxNx4D(m, n) \ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -208,8 +191,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, /* clang-format off */ // 64x64 highbd_sadMxN(64, 64) -highbd_sadMxNxK(64, 64, 3) -highbd_sadMxNxK(64, 64, 8) highbd_sadMxNx4D(64, 64) // 64x32 @@ -222,8 +203,6 @@ highbd_sadMxNx4D(32, 64) // 32x32 highbd_sadMxN(32, 32) -highbd_sadMxNxK(32, 32, 3) -highbd_sadMxNxK(32, 32, 8) highbd_sadMxNx4D(32, 32) // 32x16 @@ -236,42 +215,30 @@ highbd_sadMxNx4D(16, 32) // 16x16 highbd_sadMxN(16, 16) -highbd_sadMxNxK(16, 16, 3) -highbd_sadMxNxK(16, 16, 8) highbd_sadMxNx4D(16, 16) // 16x8 highbd_sadMxN(16, 8) -highbd_sadMxNxK(16, 8, 3) -highbd_sadMxNxK(16, 8, 8) highbd_sadMxNx4D(16, 8) // 8x16 highbd_sadMxN(8, 16) -highbd_sadMxNxK(8, 16, 3) -highbd_sadMxNxK(8, 16, 8) highbd_sadMxNx4D(8, 16) // 8x8 highbd_sadMxN(8, 8) -highbd_sadMxNxK(8, 8, 3) -highbd_sadMxNxK(8, 8, 8) highbd_sadMxNx4D(8, 
8) // 8x4 highbd_sadMxN(8, 4) -highbd_sadMxNxK(8, 4, 8) highbd_sadMxNx4D(8, 4) // 4x8 highbd_sadMxN(4, 8) -highbd_sadMxNxK(4, 8, 8) highbd_sadMxNx4D(4, 8) // 4x4 highbd_sadMxN(4, 4) -highbd_sadMxNxK(4, 4, 3) -highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) /* clang-format on */ diff --git a/libs/libvpx/vpx_dsp/skin_detection.c b/libs/libvpx/vpx_dsp/skin_detection.c new file mode 100644 index 0000000000..bbbb6c3a17 --- /dev/null +++ b/libs/libvpx/vpx_dsp/skin_detection.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" + +#define MODEL_MODE 1 + +// Fixed-point skin color model parameters. +static const int skin_mean[5][2] = { { 7463, 9614 }, + { 6400, 10240 }, + { 7040, 10240 }, + { 8320, 9280 }, + { 6800, 9614 } }; +static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 +static const int skin_threshold[6] = { 1570636, 1400000, 800000, + 800000, 800000, 800000 }; // q18 +// Thresholds on luminance. +static const int y_low = 40; +static const int y_high = 220; + +// Evaluates the Mahalanobis distance measure for the input CbCr values. +static int vpx_evaluate_skin_color_difference(const int cb, const int cr, + const int idx) { + const int cb_q6 = cb << 6; + const int cr_q6 = cr << 6; + const int cb_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); + const int cbcr_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); + const int cr_diff_q12 = + (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); + const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; + const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; + const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; + const int skin_diff = + skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + + skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; + return skin_diff; +} + +// Checks if the input yCbCr values corresponds to skin color. +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) { + if (y < y_low || y > y_high) { + return 0; + } else if (MODEL_MODE == 0) { + return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); + } else { + int i = 0; + // Exit on grey. + if (cb == 128 && cr == 128) return 0; + // Exit on very strong cb. + if (cb > 150 && cr < 110) return 0; + for (; i < 5; ++i) { + int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i); + if (skin_color_diff < skin_threshold[i + 1]) { + if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { + return 0; + } else if (motion == 0 && + skin_color_diff > (skin_threshold[i + 1] >> 1)) { + return 0; + } else { + return 1; + } + } + // Exit if difference is much large than the threshold. + if (skin_color_diff > (skin_threshold[i + 1] << 3)) { + return 0; + } + } + return 0; + } +} diff --git a/libs/libvpx/vpx_dsp/skin_detection.h b/libs/libvpx/vpx_dsp/skin_detection.h new file mode 100644 index 0000000000..a2e99baf7e --- /dev/null +++ b/libs/libvpx/vpx_dsp/skin_detection.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
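The skin classifier above is pure integer math, and its fixed-point bookkeeping lines up with the q16/q18 notes on the constants: Cb and Cr enter in Q6, the squared differences are Q12, the (+ (1 << 9)) >> 10 step rounds them to Q2, and weighting by the Q16 inverse-covariance terms yields a Q18 Mahalanobis distance that is compared directly against the Q18 thresholds. A hedged usage sketch of the exported entry point (the sample values are made up for illustration):

    #include <stdio.h>

    #include "vpx_dsp/skin_detection.h"

    int main(void) {
      /* Hypothetical luma/chroma sample; motion = 1 skips the tighter
       * threshold that static (motion == 0) pixels must also pass. */
      const int y = 100, cb = 120, cr = 150, motion = 1;
      printf("skin: %d\n", vpx_skin_pixel(y, cb, cr, motion));
      return 0;
    }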
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_SKIN_DETECTION_H_ +#define VPX_DSP_SKIN_DETECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vpx_dsp/txfm_common.h b/libs/libvpx/vpx_dsp/txfm_common.h index fd27f928ed..d01d7085a2 100644 --- a/libs/libvpx/vpx_dsp/txfm_common.h +++ b/libs/libvpx/vpx_dsp/txfm_common.h @@ -25,42 +25,42 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; +static const tran_coef_t cospi_1_64 = 16364; +static const tran_coef_t cospi_2_64 = 16305; +static const tran_coef_t cospi_3_64 = 16207; +static const tran_coef_t cospi_4_64 = 16069; +static const tran_coef_t cospi_5_64 = 15893; +static const tran_coef_t cospi_6_64 = 15679; +static const tran_coef_t cospi_7_64 = 15426; +static const tran_coef_t cospi_8_64 = 15137; +static const tran_coef_t cospi_9_64 = 14811; +static const tran_coef_t cospi_10_64 = 14449; +static const tran_coef_t cospi_11_64 = 14053; +static const tran_coef_t cospi_12_64 = 13623; +static const tran_coef_t cospi_13_64 = 13160; +static const tran_coef_t cospi_14_64 = 12665; +static const tran_coef_t cospi_15_64 = 12140; +static const tran_coef_t cospi_16_64 = 11585; +static const tran_coef_t cospi_17_64 = 11003; +static const tran_coef_t cospi_18_64 = 10394; +static const tran_coef_t cospi_19_64 = 9760; +static const tran_coef_t cospi_20_64 = 9102; +static const tran_coef_t cospi_21_64 = 8423; +static const tran_coef_t cospi_22_64 = 7723; +static const tran_coef_t cospi_23_64 = 7005; +static const tran_coef_t cospi_24_64 = 6270; +static const tran_coef_t cospi_25_64 = 
5520; +static const tran_coef_t cospi_26_64 = 4756; +static const tran_coef_t cospi_27_64 = 3981; +static const tran_coef_t cospi_28_64 = 3196; +static const tran_coef_t cospi_29_64 = 2404; +static const tran_coef_t cospi_30_64 = 1606; +static const tran_coef_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; +static const tran_coef_t sinpi_1_9 = 5283; +static const tran_coef_t sinpi_2_9 = 9929; +static const tran_coef_t sinpi_3_9 = 13377; +static const tran_coef_t sinpi_4_9 = 15212; #endif // VPX_DSP_TXFM_COMMON_H_ diff --git a/libs/libvpx/vpx_dsp/variance.c b/libs/libvpx/vpx_dsp/variance.c index d1fa0560d5..93bd8f30de 100644 --- a/libs/libvpx/vpx_dsp/variance.c +++ b/libs/libvpx/vpx_dsp/variance.c @@ -164,7 +164,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ \ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ } @@ -294,7 +294,7 @@ static void highbd_12_variance(const uint8_t *a8, int a_stride, uint32_t *sse) { \ int sum; \ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } \ \ uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ @@ -463,8 +463,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -483,8 +483,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -503,8 +503,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ diff --git a/libs/libvpx/vpx_dsp/variance.h b/libs/libvpx/vpx_dsp/variance.h index 4c482551e0..100573299b 100644 --- a/libs/libvpx/vpx_dsp/variance.h +++ b/libs/libvpx/vpx_dsp/variance.h @@ -74,8 +74,6 @@ typedef struct vp9_variance_vtable { vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.c 
b/libs/libvpx/vpx_dsp/vpx_convolve.c index f17281b369..e55a963f9d 100644 --- a/libs/libvpx/vpx_dsp/vpx_convolve.c +++ b/libs/libvpx/vpx_dsp/vpx_convolve.c @@ -25,6 +25,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -46,6 +47,7 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -72,7 +74,7 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -95,7 +97,7 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -111,11 +113,52 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
@@ -128,129 +171,49 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[135 * 64]; - int intermediate_height = + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); -} - -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); -} - -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); -} - -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); -} - -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int 
y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ + // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -260,83 +223,81 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - src += src_stride; dst += dst_stride; } } void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int 
y_step_q4, int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -352,14 +313,13 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -377,14 
+337,13 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -402,14 +361,13 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -429,11 +387,11 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { +static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -447,7 +405,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
uint16_t temp[64 * 135]; - int intermediate_height = + const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); @@ -456,112 +414,97 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4, - x_step_q4, w, intermediate_height, bd); - highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), - 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, - bd); + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int 
y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); + highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. 
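These hunks change the convolve interface: a single InterpKernel table plus explicit x0_q4/y0_q4 phases replaces the two raw int16_t filter pointers, and the high bit depth entry points now take uint16_t buffers directly instead of CONVERT_TO_*-wrapped uint8_t pointers. A minimal caller sketch under the new convention (the kernel table and phases are placeholders; any table of sixteen 8-tap kernels with phases in 0..15 would do):

static void highbd_interp_block_sketch(const uint16_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *kernels, /* 16 x 8-tap table */
                                       int subpel_x, int subpel_y,  /* phases, 0..15 */
                                       int w, int h, int bd) {
  /* Unscaled prediction: both steps stay at 16 (one full pel per output). */
  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, kernels, subpel_x,
                         16, subpel_y, 16, w, h, bd);
}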
DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, - 0, NULL, 0, w, h, bd); + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, + bd); } -void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int r; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (r = h; r > 0; --r) { @@ -571,24 +514,22 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - } + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); src += src_stride; dst += dst_stride; } diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.h b/libs/libvpx/vpx_dsp/vpx_convolve.h index ee9744b3ae..7979268a95 100644 --- a/libs/libvpx/vpx_dsp/vpx_convolve.h +++ b/libs/libvpx/vpx_dsp/vpx_convolve.h @@ -19,15 +19,15 @@ extern "C" { typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #if CONFIG_VP9_HIGHBITDEPTH -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #endif diff --git a/libs/libvpx/vpx_dsp/vpx_dsp.mk b/libs/libvpx/vpx_dsp/vpx_dsp.mk index a78041ce7b..3b1a873cd2 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libs/libvpx/vpx_dsp/vpx_dsp.mk @@ -13,6 +13,13 @@ 
DSP_SRCS-yes += vpx_dsp_common.h DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h +DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h +DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h +# This file is included in libs.mk. Including it here would cause it to be +# compiled into an object. Even as an empty file, this would create an +# executable section on the stack. +#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM) + # bit reader DSP_SRCS-yes += prob.h DSP_SRCS-yes += prob.c @@ -43,11 +50,14 @@ DSP_SRCS-yes += intrapred.c DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) @@ -56,6 +66,7 @@ DSP_SRCS-yes += deblock.c DSP_SRCS-yes += postproc.h DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm endif # CONFIG_POSTPROC @@ -77,6 +88,8 @@ DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c +DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm @@ -86,9 +99,15 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) @@ -99,7 +118,6 @@ DSP_SRCS-yes += arm/vpx_convolve_neon.c else ifeq ($(HAVE_NEON),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c -DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c @@ -130,6 +148,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c +DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c + # loop filters DSP_SRCS-yes += loopfilter.c @@ -137,15 +157,11 @@ DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/loopfilter_neon.c -DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) 
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/loopfilter_mb_neon.c -endif # HAVE_NEON +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h @@ -161,6 +177,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH @@ -180,6 +197,10 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c @@ -193,48 +214,64 @@ DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm -endif # ARCH_X86_64 +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/save_reg_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon.c -DSP_SRCS-yes += arm/idct4x4_add_neon.c -DSP_SRCS-yes += arm/idct8x8_1_add_neon.c -DSP_SRCS-yes += arm/idct8x8_add_neon.c -DSP_SRCS-yes += arm/idct16x16_1_add_neon.c -DSP_SRCS-yes += arm/idct16x16_add_neon.c -DSP_SRCS-yes += arm/idct32x32_1_add_neon.c -DSP_SRCS-yes += arm/idct32x32_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c +DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) +DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c -endif # CONFIG_VP9_HIGHBITDEPTH +else # CONFIG_VP9_HIGHBITDEPTH +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += 
arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +else +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c + endif # CONFIG_VP9 # quantization @@ -242,32 +279,39 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm -DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm -endif # avg DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm endif +DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c endif # CONFIG_VP9_ENCODER +# skin detection +DSP_SRCS-yes += skin_detection.h +DSP_SRCS-yes += skin_detection.c + ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c @@ -276,11 +320,15 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c +DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c + DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm @@ -288,6 +336,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += 
x86/subtract_sse2.asm +DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm @@ -299,16 +349,20 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h +DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c + DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c +DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm @@ -325,7 +379,19 @@ endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h + +# PPC VSX utilities +DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h + +# X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_common.h b/libs/libvpx/vpx_dsp/vpx_dsp_common.h index 49d36e5458..c8c852374f 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/libs/libvpx/vpx_dsp/vpx_dsp_common.h @@ -43,6 +43,8 @@ typedef int32_t tran_high_t; typedef int16_t tran_low_t; #endif // CONFIG_VP9_HIGHBITDEPTH +typedef int16_t tran_coef_t; + static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } @@ -55,7 +57,6 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -#if CONFIG_VP9_HIGHBITDEPTH static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -64,7 +65,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { case 12: return (uint16_t)clamp(val, 0, 4095); } } -#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9fea2d1cf2..1a743d910e 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. 
+## + sub vpx_dsp_forward_decls() { print < + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + 
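/* The two 128-bit lanes of each src[i] carry the two side-by-side 8x8 blocks,
 * so the stores using selector 0x20 (low lane of both operands) emit the first
 * block's 64 coefficients and the stores using 0x31 (high lanes) emit the
 * second block's. */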
_mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + + for (idx = 0; idx < 2; ++idx) { + int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + + coeff += 16; + t_coeff += 16; + } +} + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index b0a104bad0..a235ba41df 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -11,6 +11,8 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, @@ -212,8 +214,8 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { __m128i src[8]; src[0] = 
_mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); @@ -227,25 +229,25 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); - _mm_store_si128((__m128i *)coeff, src[0]); + store_tran_low(src[0], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); + store_tran_low(src[1], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); + store_tran_low(src[2], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); + store_tran_low(src[3], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); + store_tran_low(src[4], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); + store_tran_low(src[5], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); + store_tran_low(src[6], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); + store_tran_low(src[7], coeff); } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { int16_t const *src_ptr = @@ -254,10 +256,10 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, } for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + __m128i coeff0 = load_tran_low(coeff); + __m128i coeff1 = load_tran_low(coeff + 64); + __m128i coeff2 = load_tran_low(coeff + 128); + __m128i coeff3 = load_tran_low(coeff + 192); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); @@ -271,25 +273,25 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); coeff += 8; } } -int vpx_satd_sse2(const int16_t *coeff, int length) { +int vpx_satd_sse2(const tran_low_t *coeff, int length) { int i; const __m128i zero = _mm_setzero_si128(); __m128i accum = zero; for (i = 0; i < length; i += 8) { - const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i src_line = load_tran_low(coeff); const __m128i inv = _mm_sub_epi16(zero, src_line); const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); diff --git a/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c new file mode 100644 index 0000000000..f83b26490e --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + /* comp and pred must be 16 byte aligned. */ + assert(((intptr_t)comp & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); + const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); + const __m128i avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)(comp + x), avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { // width must be 4 or 8. + int i; + // Process 16 elements at a time. comp and pred have width == stride and + // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all + // divisible by 16 so just ref needs to be massaged when loading. + for (i = 0; i < width * height; i += 16) { + const __m128i p = _mm_load_si128((const __m128i *)pred); + __m128i r; + __m128i avg; + if (width == ref_stride) { + r = _mm_loadu_si128((const __m128i *)ref); + ref += 16; + } else if (width == 4) { + r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride), + *(const uint32_t *)(ref + 2 * ref_stride), + *(const uint32_t *)(ref + ref_stride), + *(const uint32_t *)(ref)); + + ref += 4 * ref_stride; + } else { + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + assert(width == 8); + r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), + (const __m64 *)(ref + ref_stride))); + + ref += 2 * ref_stride; + } + avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)comp, avg); + + pred += 16; + comp += 16; + } + } +} diff --git a/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm index 26412e8e43..22e0a086cc 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm +++ b/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm @@ -8,42 +8,50 @@ ; be found in the AUTHORS file in the root of the source tree. ; -%define private_prefix vpx - %include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the hadamard transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. 
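For reference, a scalar sketch (not part of the patch) of what vpx_comp_avg_pred_sse2 above computes; _mm_avg_epu8 rounds up, so each output pixel is (pred + ref + 1) >> 1:

static void comp_avg_pred_sketch(uint8_t *comp, const uint8_t *pred, int width,
                                 int height, const uint8_t *ref,
                                 int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) comp[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);
    comp += width; /* comp and pred use stride == width */
    pred += width;
    ref += ref_stride;
  }
}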
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text %if ARCH_X86_64 ; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro +%macro TRANSPOSE8X8 10 + ; stage 1 + punpcklwd m%9, m%1, m%2 + punpcklwd m%10, m%3, m%4 + punpckhwd m%1, m%2 + punpckhwd m%3, m%4 -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 + punpcklwd m%2, m%5, m%6 + punpcklwd m%4, m%7, m%8 + punpckhwd m%5, m%6 + punpckhwd m%7, m%8 - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 + ; stage 2 + punpckldq m%6, m%9, m%10 + punpckldq m%8, m%1, m%3 + punpckhdq m%9, m%10 + punpckhdq m%1, m%3 - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 + punpckldq m%10, m%2, m%4 + punpckldq m%3, m%5, m%7 + punpckhdq m%2, m%4 + punpckhdq m%5, m%7 - SWAP %2, %5 - SWAP %4, %7 + ; stage 3 + punpckhqdq m%4, m%9, m%2 ; out3 + punpcklqdq m%9, m%2 ; out2 + punpcklqdq m%7, m%1, m%5 ; out6 + punpckhqdq m%1, m%5 ; out7 + + punpckhqdq m%2, m%6, m%10 ; out1 + punpcklqdq m%6, m%10 ; out0 + punpcklqdq m%5, m%8, m%3 ; out4 + punpckhqdq m%8, m%3 ; out5 + + SWAP %6, %1 + SWAP %3, %9 + SWAP %8, %6 %endmacro %macro HMD8_1D 0 @@ -87,8 +95,9 @@ SECTION .text SWAP 7, 9 %endmacro + INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output +cglobal hadamard_8x8, 3, 5, 11, input, stride, output lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -105,17 +114,17 @@ cglobal hadamard_8x8, 3, 5, 10, input, stride, output mova m7, [inputq + r3] HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 HMD8_1D - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + STORE_TRAN_LOW 0, outputq, 0, 8, 9 + STORE_TRAN_LOW 1, outputq, 8, 8, 9 + STORE_TRAN_LOW 2, outputq, 16, 8, 9 + STORE_TRAN_LOW 3, outputq, 24, 8, 9 + STORE_TRAN_LOW 4, outputq, 32, 8, 9 + STORE_TRAN_LOW 5, outputq, 40, 8, 9 + STORE_TRAN_LOW 6, outputq, 48, 8, 9 + STORE_TRAN_LOW 7, outputq, 56, 8, 9 RET %endif diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000000..3552c07cd3 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 16 16 bit values. If the source is 32 bits then pack down with +// saturation. 
+static INLINE __m256i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); +#else + return _mm256_loadu_si256((const __m256i *)a); +#endif +} + +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} +#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm new file mode 100644 index 0000000000..aacf71f7ac --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -0,0 +1,90 @@ +; +; Copyright (c) 2017 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm. +; vpx_config.asm is not guarded so can not be included twice. Because this will +; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be +; included after those files. + +; Increment register by sizeof() tran_low_t * 8. +%macro INCREMENT_TRAN_LOW 1 +%if CONFIG_VP9_HIGHBITDEPTH + add %1, 32 +%else + add %1, 16 +%endif +%endmacro + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 +%if CONFIG_VP9_HIGHBITDEPTH + lea %1, [%1 + %2 * 4] +%else + lea %1, [%1 + %2 * 2] +%endif +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. +%macro LOAD_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%else + mova m%1, [%2 + (%3) * 2] +%endif +%endmacro + +; Store m%1 to %2 + %3. +; %3 is the offset in elements, not bytes. +; If 5 arguments are provided then m%1 is corrupted. +; If 6 arguments are provided then m%1 is preserved. +; If tran_low_t is 16 bits (low bit depth configuration) then store the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign +; extend the values first. +; Uses m%4-m%6 as scratch registers for high bit depth. +%macro STORE_TRAN_LOW 5-6 +%if CONFIG_VP9_HIGHBITDEPTH + pxor m%4, m%4 + mova m%5, m%1 + %if %0 == 6 + mova m%6, m%1 + %endif + pcmpgtw m%4, m%1 + punpcklwd m%5, m%4 + %if %0 == 5 + punpckhwd m%1, m%4 + %else + punpckhwd m%6, m%4 + %endif + mova [%2 + (%3) * 4 + 0], m%5 + %if %0 == 5 + mova [%2 + (%3) * 4 + 16], m%1 + %else + mova [%2 + (%3) * 4 + 16], m%6 + %endif +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro + +; Store zeros (in m%1) to %2 + %3. +; %3 is the offset in elements, not bytes. 
+%macro STORE_ZERO_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova [%2 + (%3) * 4 + 0], m%1 + mova [%2 + (%3) * 4 + 16], m%1 +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h new file mode 100644 index 0000000000..5d1d779572 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m128i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +#else + return _mm_load_si128((const __m128i *)a); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +#else + _mm_store_si128((__m128i *)(b), a); +#endif +} + +// Zero fill 8 positions in the output buffer. 
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m128i zero = _mm_setzero_si128(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(a), zero); + _mm_store_si128((__m128i *)(a + 4), zero); +#else + _mm_store_si128((__m128i *)(a), zero); +#endif +} +#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve.h b/libs/libvpx/vpx_dsp/x86/convolve.h index 2a0516cdc1..68d7589d45 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve.h +++ b/libs/libvpx/vpx_dsp/x86/convolve.h @@ -20,11 +20,16 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ void vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter = filter_kernel[offset]; \ + (void)x0_q4; \ + (void)x_step_q4; \ + (void)y0_q4; \ + (void)y_step_q4; \ assert(filter[3] != 128); \ assert(step_q4 == 16); \ if (filter[0] | filter[1] | filter[2]) { \ @@ -60,32 +65,36 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } -#define FUN_CONV_2D(avg, opt) \ - void vpx_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ +#define FUN_CONV_2D(avg, opt) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_x = filter[x0_q4]; \ + const int16_t *filter_y = filter[y0_q4]; \ + (void)filter_y; \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + 
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h); \ + } \ } #if CONFIG_VP9_HIGHBITDEPTH @@ -97,97 +106,97 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt( \ - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ - } \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter = filter_kernel[offset]; \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 
8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/x86/convolve_avx2.h b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 0000000000..bc96b738f4 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); + // round and shift by 7 bit each 16 bit + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 
with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
+}
+
+#undef MM256_BROADCASTSI128_SI256
+
+#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..e5d452f99e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#define VPX_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <assert.h>
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vpx_config.h"
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+                                        __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
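+  // The masks below pair the taps as (0, zero), (1, 2), (3, 4), (5, 6) and
+  // (7, zero); the zero byte comes from the high byte of filter[3], which the
+  // assert below guarantees is 0.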
+ assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. 
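+  // x5 = 64 * (p0 + p1) for each adjacent pixel pair in s[2]; this is at most
+  // 64 * 255 * 2 = 32640, so it stays within the int16 range.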
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm index 6df360df44..97cb43b671 100644 --- a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -78,11 +78,13 @@ %endmacro %macro UPDATE_FLIMIT 0 - movdqa xmm2, XMMWORD PTR [rbx] - movdqa [rsp], xmm2 + movdqu xmm2, XMMWORD PTR [rbx] + movdqu [rsp], xmm2 add rbx, 16 %endmacro +SECTION .text + ;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, @@ -230,11 +232,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_xmm(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) +;void vpx_mbpost_proc_down_sse2(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) extern sym(vpx_rv) -global sym(vpx_mbpost_proc_down_xmm) PRIVATE -sym(vpx_mbpost_proc_down_xmm): +global sym(vpx_mbpost_proc_down_sse2) PRIVATE +sym(vpx_mbpost_proc_down_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -462,10 +464,10 @@ sym(vpx_mbpost_proc_down_xmm): %undef flimit4 -;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src, -; int pitch, int rows, int cols,int flimit) -global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE -sym(vpx_mbpost_proc_across_ip_xmm): +;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE +sym(vpx_mbpost_proc_across_ip_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 diff --git a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 39d3a3f59c..132e065239 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); diff --git a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index 3744333909..32b9bd2813 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 743e55e635..f9abaecf28 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 78a1dbb24f..32824a03a3 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -10,10 +10,6 @@ %include "third_party/x86inc/x86inc.asm" -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. 
- SECTION_RODATA pw_11585x2: times 8 dw 23170 @@ -32,107 +28,11 @@ TRANSFORM_COEFFS 9102, 13623 SECTION .text %if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -; 1D forward 8x8 DCT transform -%macro FDCT8_1D 1 - SUM_SUB 0, 7, 9 - SUM_SUB 1, 6, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 3, 4, 9 - - SUM_SUB 0, 3, 9 - SUM_SUB 1, 2, 9 - SUM_SUB 6, 5, 9 -%if %1 == 0 - SUM_SUB 0, 1, 9 -%endif - - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 - - pmulhrsw m6, m12 - pmulhrsw m5, m12 -%if %1 == 0 - pmulhrsw m0, m12 - pmulhrsw m1, m12 -%else - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 - SWAP 0, 1 -%endif - - SUM_SUB 4, 5, 9 - SUM_SUB 7, 6, 9 - BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 - SWAP 1, 4 - SWAP 3, 6 -%endmacro - -%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 - psraw m%3, m%1, 15 - psraw m%4, m%2, 15 - psubw m%1, m%3 - psubw m%2, m%4 - psraw m%1, 1 - psraw m%2, 1 -%endmacro - INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] - pxor m11, m11 + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -159,25 +59,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride psllw m7, 2 ; column transform - FDCT8_1D 0 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 - FDCT8_1D 1 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + paddw m9, m1, m6 + psubw m1, m6 - DIVIDE_ROUND_2X 0, 1, 9, 10 - DIVIDE_ROUND_2X 2, 3, 9, 10 - DIVIDE_ROUND_2X 4, 5, 9, 10 - DIVIDE_ROUND_2X 6, 7, 9, 10 + paddw m7, m2, m5 + psubw m2, m5 - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + 
pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + 
punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + mova [outputq + 0], m4 + mova [outputq + 16], m5 + mova [outputq + 32], m3 + mova [outputq + 48], m0 + mova [outputq + 64], m10 + mova [outputq + 80], m9 + mova [outputq + 96], m7 + mova [outputq + 112], m2 RET %endif diff --git a/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 0000000000..7e75d5d10c --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1108 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int width, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width 
= 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int width, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + __m256i p0, p1, p2, p3, u0, u1, u2, u3; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + 
src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +#define CONV8_ROUNDING_BITS (7) + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = 
_mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void vpx_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = 
_mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void vpx_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, 
signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void vpx_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + 
filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void 
update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void vpx_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void vpx_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t 
height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +// Calculation with averaging the input pixels + +static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, + const __m128i *y1, + const __m128i *mask, + uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + 
store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t 
height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 +#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 +#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 +#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 + +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_2D(, avx2); + +void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +#define vpx_highbd_filter_block1d4_h8_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_h2_avg_avx2 \ + vpx_highbd_filter_block1d4_h2_avg_sse2 +#define vpx_highbd_filter_block1d4_v8_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_v2_avg_avx2 \ + vpx_highbd_filter_block1d4_v2_avg_sse2 + +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, + avx2); +HIGH_FUN_CONV_2D(avg_, avx2); + +#undef HIGHBD_FUNC diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c new file mode 100644 index 0000000000..f4f7235d13 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], 
step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + abs_extend_64bit_sse2(io[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + abs_extend_64bit_sse2(io[0], temp, sign); + step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; 
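+  // NOTE (reader aid, not part of the original change): highbd_butterfly_sse2()
+  // and highbd_butterfly_cospi16_sse2() are the usual vpx rotations on 32-bit
+  // lanes (SSE2 intrinsics come from <emmintrin.h>, the SSE4.1 ones from
+  // <smmintrin.h>). A scalar model, assuming DCT_CONST_BITS == 14:
+  //
+  //   out0 = ROUND_POWER_OF_TWO((int64_t)a * c0 - (int64_t)b * c1, 14);
+  //   out1 = ROUND_POWER_OF_TWO((int64_t)a * c1 + (int64_t)b * c0, 14);
+  //
+  // and the cospi16 flavour is the c0 == c1 == cospi_16_64 special case:
+  //
+  //   out0 = ROUND_POWER_OF_TWO((int64_t)(a + b) * cospi_16_64, 14);
+  //   out1 = ROUND_POWER_OF_TWO((int64_t)(a - b) * cospi_16_64, 14);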
+ step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(input, 16, in); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + 
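+  // NOTE (reader aid, not part of the original change): the _256/_38/_10/_1
+  // suffixes name the maximum number of non-zero input coefficients each
+  // variant handles; the caller picks one from the eob. When bd == 8 the
+  // coefficients fit in 16 bits, so those branches pack to 16-bit lanes and
+  // reuse the plain idct16_8col(); otherwise the 32-bit, 4-column kernels run.
+  // The DC-only case below boils down to one value added to every pixel;
+  // roughly, assuming DCT_CONST_BITS == 14 and the 16x16 final shift of 6:
+  //
+  //   int64_t t = ROUND_POWER_OF_TWO((int64_t)input[0] * cospi_16_64, 14);
+  //   int a1 = (int)ROUND_POWER_OF_TWO(t * cospi_16_64, 14);
+  //   a1 = ROUND_POWER_OF_TWO(a1, 6);
+  //   // every output: dest[i] = clip_pixel_highbd(dest[i] + a1, bd);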
highbd_idct_1_add_kernel(input, dest, stride, bd, 16); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c new file mode 100644 index 0000000000..de097c66a6 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, 
cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + extend_64bit(io[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + extend_64bit(io[0], temp); + step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], 
&step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for 
(j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c new file mode 100644 index 0000000000..c710e89954 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20] + step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21] + step2[22] = _mm_add_epi32(step1[21], step1[22]); + step2[23] = _mm_add_epi32(step1[20], step1[23]); + + step2[24] = _mm_add_epi32(step1[27], step1[24]); + step2[25] = _mm_add_epi32(step1[26], step1[25]); + step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26] + step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27] + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse2(step2[20], 
step2[27], cospi_8_64, cospi_24_64, + &step1[27], &step1[20]); + highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64, + &step1[26], &step1[21]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block 
__m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + 
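+  // NOTE (reader aid, not part of the original change): several intermediates
+  // in these SSE2 paths are deliberately left negated (see the
+  // "step2[x] = -step2[x]" comments). SSE2 has no signed 32x32->64 multiply
+  // (_mm_mul_epi32 is SSE4.1), so the helpers multiply magnitudes with
+  // _mm_mul_epu32 and the callers fold the sign back in later, e.g. by
+  // swapping which output of the following butterfly receives the add and
+  // which the subtract. The SSE4.1 twins of these files skip that bookkeeping.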
step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], 
step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64, + &step1[17], &step1[30]); + highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64, + &step1[21], &step1[26]); + + highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = 
_mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + 
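+  // NOTE (reader aid, not part of the original change): a "partial" butterfly
+  // is the rotation with one input known to be zero, so it collapses to two
+  // scalings of the same lane; roughly, with DCT_CONST_BITS == 14:
+  //
+  //   out0 = ROUND_POWER_OF_TWO((int64_t)a * c0, 14);
+  //   out1 = ROUND_POWER_OF_TWO((int64_t)a * c1, 14);
+  //
+  // The SSE2 "_neg" flavour covers the case where the surviving term enters
+  // the first output with a minus sign, i.e. out0 uses -a * c1 and out1 uses
+  // a * c0.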
highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10] + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13] + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step2[18] = + _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18] + step2[22] = + _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22] + step2[25] = + _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25] + step2[29] = + _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29] + step1[16] = step2[16]; + step1[31] = step2[31]; + 
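+  // NOTE (reader aid, not part of the original change): with eob <= 34 only
+  // in[1], in[3], in[5] and in[7] feed this quarter, so each stage-1 partner
+  // is zero and stage 2's add/sub pairs degenerate into copies; for example
+  //
+  //   step2[16] = step1[16] + step1[17];  step2[17] = step1[16] - step1[17];
+  //
+  // reduces to step2[16] = step2[17] = step1[16] because step1[17] == 0 here.
+  // The explicit _mm_setzero_si128() subtractions above rebuild the negated
+  // values that the SSE2 butterflies expect.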
highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_sse2(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_sse2(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 32); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c new file mode 100644 index 0000000000..2d0a53ac0a --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64, + &step2[10], &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[23], step1[20]); + step2[21] = _mm_sub_epi32(step1[22], step1[21]); + step2[22] = _mm_add_epi32(step1[22], step1[21]); + step2[23] = _mm_add_epi32(step1[23], step1[20]); + + step2[24] = _mm_add_epi32(step1[24], step1[27]); + step2[25] = _mm_add_epi32(step1[25], step1[26]); + step2[26] = _mm_sub_epi32(step1[25], step1[26]); + step2[27] = _mm_sub_epi32(step1[24], step1[27]); + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64, + &step1[20], &step1[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64, + &step1[21], &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = 
_mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = 
_mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + 
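+  // NOTE (reader aid, not part of the original change): highbd_add_sub_butterfly()
+  // is the mirrored recombination applied once each half is ready; a rough
+  // sketch of its expected behaviour:
+  //
+  //   for (i = 0; i < size / 2; ++i) {
+  //     out[i]            = _mm_add_epi32(in[i], in[size - 1 - i]);
+  //     out[size - 1 - i] = _mm_sub_epi32(in[i], in[size - 1 - i]);
+  //   }
+  //
+  // size == 16 merges quarters 1 and 2 (stage 7); size == 32 merges the lower
+  // and upper halves into the final 32 outputs.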
highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], 
step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + 
highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_135_8x32_ssse3(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_135_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i 
*out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_ssse3(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 
32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c new file mode 100644 index 0000000000..2e54d24736 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0, + const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3 + const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3 + return dct_const_round_shift_sse2(t2); +} + +static INLINE void highbd_idct4_small_sse2(__m128i *const io) { + const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0); + const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0); + const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0); + __m128i temp1[4], temp2[4], step[4]; + + transpose_32bit_4x4(io, io); + + // Note: There is no 32-bit signed multiply SIMD instruction in SSE2. + // _mm_mul_epu32() is used which can only guarantee the lower 32-bit + // (signed) result is meaningful, which is enough in this function.
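+ // (Editorial illustration, not part of the upstream change.) The unsigned
+ // multiply is safe because, in two's complement, the low 32 bits of the
+ // unsigned product equal the low 32 bits of the signed product. For
+ // example, -3 * cospi_16_64 = -3 * 11585 = -34755, while _mm_mul_epu32()
+ // computes 0xFFFFFFFD * 11585, whose low 32 bits are 0xFFFF783D, i.e.
+ // -34755 when reinterpreted as signed. dct_const_round_shift_4_sse2()
+ // above keeps exactly those low 32 bits, so the result is correct as long
+ // as the true signed product fits in 32 bits; the max/min range check in
+ // vpx_highbd_idct4x4_16_add_sse2() below ensures that before this path is
+ // taken.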
+ + // stage 1 + temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3 + temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3 + temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + temp1[3] = _mm_srli_si128(io[1], 4); + temp2[3] = _mm_srli_si128(io[3], 4); + temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64 + temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64 + temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64 + temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64 + temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8 + temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8 + temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 + temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 + step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +static INLINE void highbd_idct4_large_sse2(__m128i *const io) { + __m128i step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); + highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int16_t max = 0, min = 0; + __m128i io[4], io_short[2]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + + if (bd != 8) { + __m128i max_input, min_input; + + max_input = _mm_max_epi16(io_short[0], io_short[1]); + min_input = _mm_min_epi16(io_short[0], io_short[1]); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4)); + min_input = _mm_min_epi16(min_input, 
_mm_srli_si128(min_input, 4)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2)); + max = _mm_extract_epi16(max_input, 0); + min = _mm_extract_epi16(min_input, 0); + } + + if (bd == 8 || (max < 4096 && min >= -4096)) { + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (max < 32767 && min > -32768) { + highbd_idct4_small_sse2(io); + highbd_idct4_small_sse2(io); + } else { + highbd_idct4_large_sse2(io); + highbd_idct4_large_sse2(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} + +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int a1, i; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < 4; ++i) { + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, dc, bd); + _mm_storel_epi64((__m128i *)dest, d); + dest += stride; + } +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c new file mode 100644 index 0000000000..38e64f3bc9 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE void highbd_idct4(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + highbd_idct4(io); + highbd_idct4(io); + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c new file mode 100644 index 0000000000..909a6b7948 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[4], sign[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + abs_extend_64bit_sse2(io[1], temp1, sign); + step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64); + step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64); + abs_extend_64bit_sse2(io[3], temp1, sign); + step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64); + step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64); + + // stage 2 + abs_extend_64bit_sse2(step1[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + abs_extend_64bit_sse2(step1[1], temp1, sign); + step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64); + step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] =
_mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 8); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c new file mode 100644 index 0000000000..ae391b2c02 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64, + &step2[2], &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + extend_64bit(io[1], temp1); + step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64); + extend_64bit(io[3], temp1); + step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64); + + // stage 2 + extend_64bit(step1[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + extend_64bit(step1[1], temp1); + step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] =
_mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_ssse3(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 0000000000..2051381aa8 
--- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i
*)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); + } +} + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const 
__m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_8x8(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + 
dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16x16(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 32; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32x32(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i 
XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = 
_mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0); + const __m128i row0 = avg2; + const __m128i row1 = avg3; + const __m128i row2 = _mm_srli_si128(avg2, 2); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 0000000000..b9dcef205b --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst[3] = above[7]; // aka H +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + __m128i *row, const __m128i *ar) { + *row = _mm_alignr_epi8(*ar, *row, 2); + _mm_store_si128((__m128i *)*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3); + dst += stride; + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + __m128i *row_0, __m128i *row_1, + const __m128i *ar) { + *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2); + *row_1 = _mm_alignr_epi8(*ar, *row_1, 2); + _mm_store_si128((__m128i *)*dst, *row_0); + _mm_store_si128((__m128i *)(*dst + 8), *row_1); + *dst += stride; +} + +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 =
avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); +} + +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + int i; + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + for (i = 1; i < 32; ++i) { + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + } +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = 
avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = 
_mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void 
vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i 
avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = 
_mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i 
avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i 
HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + 
_mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +} + +static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride, + __m128i *a, __m128i *b, const __m128i *ar) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); +} + +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, 
ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + (void)left; + (void)bd; + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); +} + +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + int i; + (void)left; + (void)bd; + for (i = 0; i < 14; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); +} + +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + __m128i avg2_2 = _mm_avg_epu16(A2, B2); + __m128i avg2_3 = _mm_avg_epu16(A3, B3); + int i; + (void)left; + (void)bd; + for (i = 0; i < 30; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2); + avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 
2); + avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h new file mode 100644 index 0000000000..e0f7495521 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 +} + +static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 4); + temp[1] = _mm_srai_epi32(temp[1], 4); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 5); + temp[1] = _mm_srai_epi32(temp[1], 5); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); + return _mm_srli_si128(t, 2); +} + +static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3 + return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 +} + +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 +} + +// Note: cospi must be non negative.
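+// SSE2 has no signed 32-bit multiply that produces 64-bit results
+// (_mm_mul_epi32 is SSE4.1), so the helpers below multiply absolute values
+// with the unsigned _mm_mul_epu32 and restore the sign afterwards.  With a
+// 64-bit sign mask s that is either all zeros or all ones, (v ^ s) - s
+// leaves v unchanged when s == 0 and yields -v when s == ~0 (e.g. v = 5,
+// s == ~0: (5 ^ s) - s = -6 + 1 = -5), which is the xor/sub pair used here.
+// That is also why the cosine constants must be passed as non-negative
+// magnitudes: only the sign of the input operand is tracked.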
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_neg_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); + t1 = _mm_sub_epi64(_mm_setzero_si128(), t1); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. 
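+// Scalar equivalent of the _neg variant below:
+//   *out0 = dct_const_round_shift(in * -c1);
+//   *out1 = dct_const_round_shift(in * c0);
+// The negation is applied to the 64-bit product before rounding (see
+// multiplication_neg_round_shift_sse2 above), presumably so the result stays
+// bit-exact with the C reference for negative products rather than negating
+// an already rounded value.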
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} + +static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[15]); + out[1] = _mm_add_epi32(in[1], in[14]); + out[2] = _mm_add_epi32(in[2], in[13]); + out[3] = _mm_add_epi32(in[3], in[12]); + out[4] = _mm_add_epi32(in[4], in[11]); + out[5] = _mm_add_epi32(in[5], in[10]); + out[6] = _mm_add_epi32(in[6], in[9]); + out[7] = _mm_add_epi32(in[7], in[8]); + out[8] = _mm_sub_epi32(in[7], in[8]); + out[9] = _mm_sub_epi32(in[6], in[9]); + out[10] = _mm_sub_epi32(in[5], in[10]); + out[11] = _mm_sub_epi32(in[4], in[11]); + out[12] = _mm_sub_epi32(in[3], in[12]); + out[13] = _mm_sub_epi32(in[2], in[13]); + out[14] = _mm_sub_epi32(in[1], in[14]); + out[15] = _mm_sub_epi32(in[0], in[15]); +} + +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_set1_epi16(0); + // Faster than _mm_set1_epi16((1 << bd) - 1). 
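+  // bd is a run-time value here, so (1 << bd) - 1 cannot become a
+  // compile-time vector constant; computing max with a vector shift and
+  // subtract below likely avoids the scalar-compute-then-broadcast sequence
+  // that a non-constant _mm_set1_epi16() would expand to.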
+ const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { + int a1, i, j; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < size; ++i) { + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); + d = add_clamp(d, dc, bd); + _mm_store_si128((__m128i *)(&dest[j]), d); + } + dest += stride; + } +} + +static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest, + const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)dest, d); +} + +static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); +} + +static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4x2(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4x2(in[1], dest, stride, bd); +} + +static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_load_si128((const __m128i *)(*dest)); + d = add_clamp(d, in, bd); + _mm_store_si128((__m128i *)(*dest), d); + *dest += stride; +} + +static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_8(in[0], &dest, stride, bd); + recon_and_store_8(in[1], &dest, stride, bd); + recon_and_store_8(in[2], &dest, stride, bd); + recon_and_store_8(in[3], &dest, stride, bd); + recon_and_store_8(in[4], &dest, stride, bd); + recon_and_store_8(in[5], &dest, stride, bd); + recon_and_store_8(in[6], &dest, stride, bd); + recon_and_store_8(in[7], &dest, stride, bd); +} + +static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { + const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0)); + const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4)); + return _mm_packs_epi32(t0, t1); +} + +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride 
+ 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + +static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store_8(out, &dest, 0, bd); +} + +static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi32(1 << 5); + __m128i out; + + out = _mm_add_epi32(in, final_rounding); + out = _mm_srai_epi32(out, 6); + out = _mm_packs_epi32(out, out); + recon_and_store_4(out, dest, bd); +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 0000000000..9c8eef40f7 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_config.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" + +static INLINE __m128i multiplication_round_shift_sse4_1( + const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c * 4, 0); + __m128i t0, t1; + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i temp1[4], temp2[4]; + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); +} + +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2]; + + extend_64bit(in, temp); + *out0 = multiplication_round_shift_sse4_1(temp, c0); + *out1 = multiplication_round_shift_sse4_1(temp, c1); +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index 7d66411080..ec22db9f4c 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -12,7 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i ubounded; @@ -48,10 +47,10 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file.
-void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); __m128i blimit, limit, thresh; @@ -475,12 +474,12 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)(s - 0 * p), q0); } -void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { - vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); - vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd); } void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -1108,8 +1107,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, highbd_transpose(src, p, dst, 8, 2); // Loop filtering - vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, - bd); + vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); src[0] = t_dst; src[1] = t_dst + 8 * 8; dst[0] = s - 8; @@ -1130,7 +1129,7 @@ void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, + vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); // Transpose back diff --git a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 64b56223ed..cedf98aff4 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include #include #include "vpx_dsp/vpx_dsp_common.h" @@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; } } } @@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; + (void)skip_block; + assert(!skip_block); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 30ee81b688..d9a6932e0b 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -72,7 +72,7 @@ SECTION .text paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse - movd rax, m6 ; store sum as return value + movd eax, m6 ; store sum as return value %endif RET %endmacro diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 923418a992..e646767e19 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index 414ae5de1a..a6f7c3d25d 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -135,7 +135,7 @@ HIGH_GET_VAR(8); highbd_8_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } \ \ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ @@ -293,12 +293,13 @@ DECLS(sse2); } \ } \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ @@ -328,7 +329,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ @@ -337,6 +339,7 @@ DECLS(sse2); int start_row; \ uint32_t sse; \ int se = 0; \ + int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ @@ -375,7 +378,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } #define FNS(opt) \ @@ -444,13 +448,14 @@ DECLS(sse2); } \ } \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ + int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ @@ -481,7 +486,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ @@ -489,6 +495,7 @@ DECLS(sse2); const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ + int64_t var; \ uint32_t sse; \ int se = 0; \ uint64_t long_sse = 0; \ @@ -530,7 +537,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } #define FNS(opt1) \ diff --git a/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm index c18095c287..61af6236ed 100644 --- a/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset psrlq m3, 8 movd [dstq+strideq ], m3 psrlq m0, 56 - movd tempq, m0 + movd tempd, m0 mov [dstq+strideq+3], tempb RESTORE_GOT diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index cb56ad0789..f6e56b6f9e 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -8,169 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. 
 */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { +static INLINE void transpose_16bit_4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); @@ -178,35 +23,75 @@ static INLINE void transpose_4x4(__m128i *res) { res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } -void idct4_sse2(__m128i *in) { +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const 
__m128i eight = _mm_set1_epi16(8); + __m128i in[2]; + + // Rows + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); + idct4_sse2(in); + + // Columns + idct4_sse2(in); + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + recon_and_store4x4_sse2(in, dest, stride); +} + +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + int a; + __m128i dc_value, d[2]; + + a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], dc_value); + d[1] = _mm_add_epi16(d[1], dc_value); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +void idct4_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; + __m128i u[2]; - transpose_4x4(in); + transpose_16bit_4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); + u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]); + u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]); // stage 2 in[0] = _mm_add_epi16(u[0], u[1]); @@ -214,7 +99,7 @@ void idct4_sse2(__m128i *in) { in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -void iadst4_sse2(__m128i *in) { +void iadst4_sse2(__m128i *const in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -224,7 +109,7 @@ void iadst4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = 
_mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8], in7; - transpose_4x4(in); + transpose_16bit_4(in); in7 = _mm_srli_si128(in[1], 8); in7 = _mm_add_epi16(in7, in[0]); in7 = _mm_sub_epi16(in7, in[1]); @@ -263,309 +148,93 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. 
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } +static INLINE void load_buffer_8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); +} void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in[8]; int i; // Load input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + load_buffer_8x8(input, in); // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); + idct8_sse2(in); } - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + write_buffer_8x8(in, dest, stride); +} - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); +} + +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); } void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16((int16_t)a1); - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - 
RECON_AND_STORE(dest + 7 * stride, dc_value); + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - +void idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); + idct8(in, in); } -void iadst8_sse2(__m128i *in) { +void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -578,7 +247,7 @@ void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -589,7 +258,7 @@ void iadst8_sse2(__m128i *in) { __m128i in0, in1, in2, in3, in4, in5, in6, in7; // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); // properly aligned for butterfly input in0 = in[7]; @@ -751,37 +420,10 @@ void iadst8_sse2(__m128i *in) { u2 = _mm_unpacklo_epi16(s6, s7); u3 = _mm_unpackhi_epi16(s6, s7); - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, 
DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); + s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16); + s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16); + s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16); + s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16); in[0] = s0; in[1] = _mm_sub_epi16(k__const_0, s4); @@ -793,521 +435,133 @@ void iadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); } -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); 
- RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); } -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = 
_mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ 
- stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], 
*curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i l[16], r[16], out[16], *in; int i; - curr1 = l; + in = l; for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); + transpose_16bit_8x8(in + 8, in + 8); + idct16_8col(in, in); + in = r; input += 128; } - for (i = 0; i < 2; i++) { + + for (i = 0; i < 16; i += 8) { int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, out[j]); } dest += 8; } } +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], temp[16], out[16]; + int i; + + 
idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } + + dest += 8; + } +} + +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; + int i; + + // First 1-D inverse DCT + // Load input data. + in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + recon_and_store_16(dest, dc_value); dest += stride; } } -static void iadst16_8col(__m128i *in) { +static void iadst16_8col(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1335,8 +589,8 @@ static void iadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1708,1722 +962,371 @@ static void iadst16_8col(__m128i *in) { u[6] = _mm_unpacklo_epi16(s[14], s[15]); u[7] = _mm_unpackhi_epi16(s[14], s[15]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] 
= _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16); + in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); + in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16); + in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16); + in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16); + in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16); in[0] = s[0]; in[1] = _mm_sub_epi16(kZero, s[8]); in[2] = s[12]; in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); in[12] = s[5]; in[13] = _mm_sub_epi16(kZero, s[13]); in[14] = s[9]; in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, 
cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = 
_mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = 
_mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], 
DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); +void idct16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + idct16_8col(in0, in0); + idct16_8col(in1, in1); } -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); +void iadst16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); iadst16_8col(in0); 
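/* Illustrative aside (assumed semantics, hypothetical scalar names; not part of
 * the patch): every rotation in these 1-D transforms, both in the explicit
 * _mm_madd_epi16 / DCT_CONST_ROUNDING / _mm_srai_epi32 / _mm_packs_epi32
 * sequences being removed and in the butterfly() / idct_calc_wraplow_sse2()
 * helpers used by the new code, appears to follow the same fixed-point pattern:
 * multiply a pair of inputs by a pair of cosine constants, round, and shift
 * right by DCT_CONST_BITS back to 16 bits.  A minimal scalar sketch, consistent
 * with the constant pairs used in this file:
 *
 *   static int16_t round_shift(int32_t x) {
 *     return (int16_t)((x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
 *   }
 *
 *   // out0 = a*c0 - b*c1,  out1 = a*c1 + b*c0, each rounded to 16 bits.
 *   static void butterfly_scalar(int16_t a, int16_t b, int c0, int c1,
 *                                int16_t *out0, int16_t *out1) {
 *     *out0 = round_shift(a * c0 - b * c1);
 *     *out1 = round_shift(a * c1 + b * c0);
 *   }
 *
 * The SSE2 code gets the same result for eight columns at once by interleaving
 * a and b with _mm_unpacklo/hi_epi16 and multiplying with constants built by
 * pair_set_epi16(c0, -c1) and pair_set_epi16(c1, c0).
 */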
iadst16_8col(in1); } -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { const __m128i zero = _mm_setzero_si128(); + __m128i step1[8], step2[8]; - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + // stage 3 + butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + // stage 4 + step2[0] = butterfly_cospi16(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. 
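/* Note on the *_10_* variant (descriptive only): in this path all non-zero
 * coefficients are assumed to sit in the upper-left 4x4 corner of the 16x16
 * block, so the first 1-D pass only needs the first four input rows, loaded
 * at a stride of one 16-coefficient row, before the 8x4 transpose. */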
- in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = 
_mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = 
load_input_data(input); \ - input += 8; \ - } +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i 
hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - 
MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, 
\ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 2 + butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = 
_mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = 
_mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 
= _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[32]; + + // stage 1 + butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + butterfly(in[5], zero, cospi_27_64, cospi_5_64, 
&step1[20], &step1[27]); + butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i io[32], col[32]; int i; // Load input data. 
Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_sse2(io, col); - array_transpose_8x8(in, in); - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; - const __m128i zero = _mm_setzero_si128(); - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
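/* The final stage of the 32-point transform is a mirrored add/sub pass:
 * outputs j and 31-j are the sum and difference of intermediates j and 31-j.
 * That is exactly what the explicit assignments below spell out, and what
 * add_sub_butterfly(temp, out, 32) in the new code presumably wraps (the
 * helper is defined elsewhere in this patch).  A minimal sketch of the
 * assumed behaviour:
 *
 *   for (int j = 0; j < 16; ++j) {
 *     out[j]      = _mm_add_epi16(temp[j], temp[31 - j]);
 *     out[31 - j] = _mm_sub_epi16(temp[j], temp[31 - j]);
 *   }
 */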
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_sse2(io, io); for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, io[j]); } dest += 8; } } +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], in[30], 
cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_1024_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_1024_8x32_quarter_1(in, temp); + idct32_1024_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_1024_8x32(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) 
{ + __m128i temp[32]; + + idct32_1024_8x32_quarter_1_2(in, temp); + idct32_1024_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - 
stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; + __m128i col[4][32], io[32]; + int i; + // rows for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = 
_mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + load_transpose_16bit_8x8(&input[16], 32, &io[16]); + load_transpose_16bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; + // columns + for (i = 0; i < 32; i += 8) { // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + 
transpose_16bit_8x8(col[3] + i, io + 24); - IDCT32 + idct32_1024_8x32(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], in[32], out[32]; + int i; - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &in[0]); + load_transpose_16bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + store_buffer_8x32(out, dest, stride); dest += 8; } } @@ -3431,611 +1334,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; + int j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); } } - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i 
zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store 
input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; 
++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 
5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + 
stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. 
Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - __m128i dc_value, d; - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - int a, i, j; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out; - - out = highbd_dct_const_round_shift(input[0] * cospi_16_64); - out = highbd_dct_const_round_shift(out * cospi_16_64); - a = ROUND_POWER_OF_TWO(out, 6); - - d = _mm_set1_epi32(a); - dc_value = _mm_packs_epi32(d, d); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 4; ++j) { - d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); - d = _mm_adds_epi16(d, dc_value); - d = _mm_max_epi16(d, zero); - d = _mm_min_epi16(d, max); - _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); - } - dest += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git 
a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index d762a04abc..5cd5098f14 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,185 +12,698 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { - const 
__m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { + const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); + return _mm_srai_epi32(t, DCT_CONST_BITS); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; +static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in, + const __m128i cospi) { + const __m128i t = _mm_madd_epi16(in, cospi); + return dct_const_round_shift_sse2(t); } -// Function to allow 8 bit optimisations to be used when profile 0 is used with +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0, + const __m128i in1, + const __m128i x) { + const __m128i t0 = idct_madd_round_shift_sse2(in0, x); + const __m128i t1 = idct_madd_round_shift_sse2(in1, x); + return _mm_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. 
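The butterfly() that follows is the fixed-point rotation used throughout these IDCT stages: it interleaves the two inputs, multiplies by the constant pair with _mm_madd_epi16, rounds by DCT_CONST_ROUNDING, shifts right by DCT_CONST_BITS (14 in libvpx), and packs back to 16 bits. A per-element scalar sketch, for illustration only (the name butterfly_scalar is hypothetical; DCT_CONST_BITS comes from vpx_dsp/txfm_common.h):

static void butterfly_scalar(int16_t in0, int16_t in1, int c0, int c1,
                             int16_t *out0, int16_t *out1) {
  const int32_t r = 1 << (DCT_CONST_BITS - 1);  // DCT_CONST_ROUNDING
  // out0 = round((in0 * c0 - in1 * c1) / 2^DCT_CONST_BITS)
  // out1 = round((in0 * c1 + in1 * c0) / 2^DCT_CONST_BITS)
  *out0 = (int16_t)((in0 * c0 - in1 * c1 + r) >> DCT_CONST_BITS);
  *out1 = (int16_t)((in0 * c1 + in1 * c0 + r) >> DCT_CONST_BITS);
}

The SSE2 version additionally saturates via _mm_packs_epi32, so intermediate results that exceed the int16_t range clamp rather than wrap.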
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = pair_set_epi16(c0, -c1); + const __m128i cst1 = pair_set_epi16(c1, c0); + const __m128i lo = _mm_unpacklo_epi16(in0, in1); + const __m128i hi = _mm_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); + *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); +} + +static INLINE __m128i butterfly_cospi16(const __m128i in) { + const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); + return idct_calc_wraplow_sse2(lo, hi, cst); +} + +// Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { +static INLINE __m128i load_input_data4(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); +#else + return _mm_loadl_epi64((const __m128i *)data); +#endif +} + +static INLINE __m128i load_input_data8(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_load_si128((const __m128i *)data); + const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); + return _mm_packs_epi32(in0, in1); #else return _mm_load_si128((const __m128i *)data); #endif } -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); +static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * stride); + in[1] = load_input_data8(input + 1 * stride); + in[2] = load_input_data8(input + 2 * stride); + in[3] = load_input_data8(input + 3 * stride); + in[4] = load_input_data8(input + 4 * stride); + in[5] = load_input_data8(input + 5 * stride); + in[6] = load_input_data8(input + 6 * stride); + in[7] = load_input_data8(input + 7 * stride); + transpose_16bit_8x8(in, in); } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); +} + +static INLINE void 
round_shift_8x8(const __m128i *const in, + __m128i *const out) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + out[0] = _mm_add_epi16(in[0], final_rounding); + out[1] = _mm_add_epi16(in[1], final_rounding); + out[2] = _mm_add_epi16(in[2], final_rounding); + out[3] = _mm_add_epi16(in[3], final_rounding); + out[4] = _mm_add_epi16(in[4], final_rounding); + out[5] = _mm_add_epi16(in[5], final_rounding); + out[6] = _mm_add_epi16(in[6], final_rounding); + out[7] = _mm_add_epi16(in[7], final_rounding); + + out[0] = _mm_srai_epi16(out[0], 5); + out[1] = _mm_srai_epi16(out[1], 5); + out[2] = _mm_srai_epi16(out[2], 5); + out[3] = _mm_srai_epi16(out[3], 5); + out[4] = _mm_srai_epi16(out[4], 5); + out[5] = _mm_srai_epi16(out[5], 5); + out[6] = _mm_srai_epi16(out[6], 5); + out[7] = _mm_srai_epi16(out[7], 5); +} + +static INLINE void write_buffer_8x8(const __m128i *const in, + uint8_t *const dest, const int stride) { + __m128i t[8]; + + round_shift_8x8(in, t); + + recon_and_store(dest + 0 * stride, t[0]); + recon_and_store(dest + 1 * stride, t[1]); + recon_and_store(dest + 2 * stride, t[2]); + recon_and_store(dest + 3 * stride, t[3]); + recon_and_store(dest + 4 * stride, t[4]); + recon_and_store(dest + 5 * stride, t[5]); + recon_and_store(dest + 6 * stride, t[6]); + recon_and_store(dest + 7 * stride, t[7]); +} + +static INLINE void recon_and_store4x4_sse2(const __m128i *const in, + uint8_t *const dest, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d[2]; + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], in[0]); + d[1] = _mm_add_epi16(d[1], in[1]); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store(dest, out); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + __m128i 
step1[8], step2[8]; + + // stage 1 + butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 2 + butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 } -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, 
t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); } -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); +static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[10], step2[11]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], 
step2[14]); + step1[15] = _mm_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step1[4] = _mm_add_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step1[7] = _mm_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[2], step1[5]); + step2[3] = _mm_add_epi16(step1[3], step1[4]); + step2[4] = _mm_sub_epi16(step1[3], step1[4]); + step2[5] = _mm_sub_epi16(step1[2], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm_add_epi16(step2[0], step1[15]); + out[1] = _mm_add_epi16(step2[1], step1[14]); + out[2] = _mm_add_epi16(step2[2], step2[13]); + out[3] = _mm_add_epi16(step2[3], step2[12]); + out[4] = _mm_add_epi16(step2[4], step2[11]); + out[5] = _mm_add_epi16(step2[5], step2[10]); + out[6] = _mm_add_epi16(step2[6], step1[9]); + out[7] = _mm_add_epi16(step2[7], step1[8]); + out[8] = _mm_sub_epi16(step2[7], step1[8]); + out[9] = _mm_sub_epi16(step2[6], step1[9]); + out[10] = _mm_sub_epi16(step2[5], step2[10]); + out[11] = _mm_sub_epi16(step2[4], step2[11]); + out[12] = _mm_sub_epi16(step2[3], step2[12]); + out[13] = _mm_sub_epi16(step2[2], step2[13]); + out[14] = _mm_sub_epi16(step2[1], step1[14]); + out[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = 
_mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. 
+ output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step1[0] = butterfly_cospi16(io[0]); + butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + + // stage 5 + butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[2] = _mm_add_epi16(step1[0], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[0], step1[5]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct32_8x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + 
butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi16(step1[16], step1[19]); + step2[17] = _mm_add_epi16(step1[17], step1[18]); + step2[18] = _mm_sub_epi16(step1[17], step1[18]); + step2[19] = _mm_sub_epi16(step1[16], step1[19]); + step2[20] = _mm_sub_epi16(step1[23], step1[20]); + step2[21] = _mm_sub_epi16(step1[22], step1[21]); + step2[22] = _mm_add_epi16(step1[22], step1[21]); + step2[23] = _mm_add_epi16(step1[23], step1[20]); + + step2[24] = _mm_add_epi16(step1[24], step1[27]); + step2[25] = _mm_add_epi16(step1[25], step1[26]); + step2[26] = _mm_sub_epi16(step1[25], step1[26]); + step2[27] = _mm_sub_epi16(step1[24], step1[27]); + step2[28] = _mm_sub_epi16(step1[31], step1[28]); + step2[29] = _mm_sub_epi16(step1[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step1[30]); + step2[31] = _mm_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm_add_epi16(step1[16], step1[23]); + out[17] = _mm_add_epi16(step1[17], step1[22]); + out[18] = _mm_add_epi16(step1[18], step1[21]); + out[19] = _mm_add_epi16(step1[19], step1[20]); + step2[20] = _mm_sub_epi16(step1[19], step1[20]); + step2[21] = _mm_sub_epi16(step1[18], step1[21]); + step2[22] = _mm_sub_epi16(step1[17], step1[22]); + step2[23] = _mm_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm_sub_epi16(step1[31], step1[24]); + step2[25] = _mm_sub_epi16(step1[30], step1[25]); + step2[26] = _mm_sub_epi16(step1[29], step1[26]); + step2[27] = _mm_sub_epi16(step1[28], step1[27]); + out[28] = _mm_add_epi16(step1[27], step1[28]); + out[29] = _mm_add_epi16(step1[26], step1[29]); + out[30] = _mm_add_epi16(step1[25], step1[30]); + out[31] = _mm_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); + butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); + butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); + butterfly(step2[24], 
step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); +} + +void idct4_sse2(__m128i *const in); +void idct8_sse2(__m128i *const in); +void idct16_sse2(__m128i *const in0, __m128i *const in1); +void iadst4_sse2(__m128i *const in); +void iadst8_sse2(__m128i *const in); +void iadst16_sse2(__m128i *const in0, __m128i *const in1); +void idct32_1024_8x32(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); #endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c new file mode 100644 index 0000000000..6e99469b63 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = _mm_set1_epi16(2 * c0); + const __m128i cst1 = _mm_set1_epi16(2 * c1); + *out0 = _mm_mulhrs_epi16(in, cst0); + *out1 = _mm_mulhrs_epi16(in, cst1); +} + +static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) { + const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64); + return _mm_mulhrs_epi16(in, coef_pair); +} + +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; + + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_ssse3(io); + write_buffer_8x8(io, dest, stride); +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1],
step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. 
+ load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_ssse3(io, col); + + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_ssse3(io, io); + + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } + + dest += 8; + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_135_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_135_8x32_quarter_1(in, temp); + idct32_135_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, 
&step1[16], + &step1[31]); + partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + idct32_135_8x32_quarter_1_2(in, temp); + idct32_135_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], io[32]; + int i; + + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + idct32_135_8x32_ssse3(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + idct32_135_8x32_ssse3(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h new file mode 100644 index 0000000000..e785c8eda1 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_DSP_X86_INV_TXFM_SSSE3_H_ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { + const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); + const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); + const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); + const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); + const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); + const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); + const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); + const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); + const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); + const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); + __m128i step1[8], step2[8], tmp[4]; + + // pass 1 + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); + tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); + tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); + tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); + step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 + step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 + + // stage 2 + step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 + step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 + + // stage 3 + tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + // pass 2 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + + // stage 1 + step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); + step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); + step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); + step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d); + + // stage 2 + step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] + step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); + step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); +
butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + io[0] = _mm_add_epi16(step1[0], step2[7]); + io[1] = _mm_add_epi16(step1[1], step1[6]); + io[2] = _mm_add_epi16(step1[2], step1[5]); + io[3] = _mm_add_epi16(step1[3], step2[4]); + io[4] = _mm_sub_epi16(step1[3], step2[4]); + io[5] = _mm_sub_epi16(step1[2], step1[5]); + io[6] = _mm_sub_epi16(step1[1], step1[6]); + io[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index 20baf820f6..0000000000 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 
-%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - 
IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and doing as many stages as possible to minimize the -; storing/loading of intermediate results. 
To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. -; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; 
stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw m11, m10 ; stp1_25 - pmulhrsw m15, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_24 - pmulhrsw m3, m10 ; stp1_23 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 11, 15 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m11 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m6, [rsp + transposed_in + 16 * 6] - - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - mova [stp + %3 + idx22], m15 - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - mova [stp + %3 + idx23], m3 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m3, m0 ; stp1_8 - mova m2, m1 ; stp1_15 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - mova m4, m7 ; stp1_11 - mova m5, m6 ; stp1_12 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m10, [pw_11585x2] - pmulhrsw m0, m10 ; stp1_1 - - mova m14, m11 ; stp1_4 - 
mova m13, m12 ; stp1_7 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m7, m0 ; stp1_0 = stp1_1 - mova m4, m0 ; stp1_1 - mova m2, m7 ; stp1_0 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 - SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m15, [stp + %4 + idx30] - mova m10, [stp + %4 + idx31] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m7 - mova [stp + %4 + idx30], m15 - mova [stp + %4 + idx31], m10 - mova m7, [stp + %4 + idx28] - mova m0, [stp + %4 + idx29] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m4 - mova [stp + %4 + idx28], m7 - mova [stp + %4 + idx29], m0 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m4, [stp + %3 + idx19] - SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m4 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m0, [stp + %4 + idx27] - mova m1, [stp + %4 + idx26] - mova m2, [stp + %4 + idx25] - mova m3, [stp + %4 + idx24] - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - mova [stp + %4 + idx27], m0 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx24], m3 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -%macro RECON_AND_STORE 1 - mova m11, [pw_32] - lea stp, [rsp + %1] - mov r6, 32 - pxor m8, m8 -%%recon_and_store: - mova m0, [stp + 16 * 32 * 0] - mova m1, [stp + 16 * 32 * 1] - mova m2, [stp + 16 * 32 * 2] - mova m3, [stp + 16 * 32 * 3] - add stp, 16 - - paddw m0, 
m11 - paddw m1, m11 - paddw m2, m11 - paddw m3, m11 - psraw m0, 6 - psraw m1, 6 - psraw m2, 6 - psraw m3, 6 - movh m4, [outputq + 0] - movh m5, [outputq + 8] - movh m6, [outputq + 16] - movh m7, [outputq + 24] - punpcklbw m4, m8 - punpcklbw m5, m8 - punpcklbw m6, m8 - punpcklbw m7, m8 - paddw m0, m4 - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - packuswb m0, m1 - packuswb m2, m3 - mova [outputq + 0], m0 - mova [outputq + 16], m2 - lea outputq, [outputq + strideq] - dec r6 - jnz %%recon_and_store -%endmacro - -%define i32x32_size 16*32*5 -%define pass_two_start 16*32*0 -%define transposed_in 16*32*4 -%define pass_one_start 16*32*0 -%define stp r8 - -INIT_XMM ssse3 -cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - lea stp, [rsp + pass_one_start] - -idct32x32_34: - mov r3, inputq - lea r4, [rsp + transposed_in] - -idct32x32_34_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_34_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - -idct32x32_34_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_34_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_135 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, m3 - pmulhrsw m3, [pw__7005x2] ; stp1_18 - pmulhrsw m4, [pw_14811x2] ; stp2_29 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, m0 - pmulhrsw m0, [pw_12140x2] ; stp1_30 - pmulhrsw m2, [pw_m11003x2] ; stp2_17 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; 
stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, m2 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - - mova m14, [rsp + transposed_in + 16 * 11] - mova m13, m14 - pmulhrsw m13, [pw_m8423x2] ; stp1_21 - pmulhrsw m14, [pw_14053x2] ; stp2_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, m0 - pmulhrsw m0, [pw__9760x2] ; stp1_22 - pmulhrsw m1, [pw_13160x2] ; stp2_25 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 
11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, m4 - pmulhrsw m4, [pw__7723x2] ; stp1_10 - pmulhrsw m5, [pw_14449x2] ; stp2_13 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, m2 - pmulhrsw m3, [pw_m10394x2] ; stp1_9 - pmulhrsw m2, [pw_12665x2] ; stp2_14 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, m13 - pmulhrsw m13, [pw_13623x2] ; stp1_6 - pmulhrsw m14, [pw_m9102x2] ; stp1_5 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m2, [rsp + transposed_in + 16 * 8] - pmulhrsw m0, [pw_11585x2] ; stp1_1 - mova m3, m2 - pmulhrsw m2, [pw__6270x2] ; stp1_2 - pmulhrsw m3, [pw_15137x2] ; stp1_3 - - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in 
SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m1, m0 ; stp1_0 = stp1_1 - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 2 - lea stp, [rsp + pass_one_start] - -idct32x32_135: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov 
r7, 2 - -idct32x32_135_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose - - IDCT32X32_135 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_135 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_135_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 2 - -idct32x32_135_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose_2 - - IDCT32X32_135 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_135_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_1024 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, [rsp + transposed_in + 16 * 31] - BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, [rsp + transposed_in + 16 * 17] - BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, [rsp + transposed_in + 16 * 25] - BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, [rsp + transposed_in + 16 * 23] - BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; 
stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, [rsp + transposed_in + 16 * 27] - BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 - - mova m13, [rsp + transposed_in + 16 * 21] - mova m14, [rsp + transposed_in + 16 * 11] - BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, [rsp + transposed_in + 16 * 19] - BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 - - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, [rsp + transposed_in + 16 * 29] - BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 
14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, [rsp + transposed_in + 16 * 30] - BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, [rsp + transposed_in + 16 * 18] - BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, [rsp + transposed_in + 16 * 22] - BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, [rsp + transposed_in + 16 * 26] - BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, [rsp + transposed_in + 16 * 28] - BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, [rsp + transposed_in + 16 * 20] - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m1, [rsp + transposed_in + 16 * 16] - -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 0, 1, 9 - pmulhrsw m0, m10 ; stp1_1 - pmulhrsw m1, m10 ; stp1_0 -%else - BUTTERFLY_4X 0, 1, 11585, 
11585, m8, 9, 10 ; stp1_1, stp1_0 - SWAP 0, 1 -%endif - mova m2, [rsp + transposed_in + 16 * 8] - mova m3, [rsp + transposed_in + 16 * 24] - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 - - mova m10, [pw_11585x2] - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp 
+ %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 4 - lea stp, [rsp + pass_one_start] - -idct32x32_1024: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 4 - -idct32x32_1024_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose - - IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 - - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_1024 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_1024_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 4 - -idct32x32_1024_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose_2 - - IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_1024_2 - - RECON_AND_STORE pass_two_start - - RET -%endif diff --git a/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm index fbbcd76bd7..bcf1a6ef98 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm @@ -9,6 +9,7 @@ ; %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -82,15 +83,8 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif + LOAD_TRAN_LOW 0, inputq, 0 + LOAD_TRAN_LOW 1, inputq, 8 psraw m0, 2 psraw m1, 2 diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c index 85923b4478..6652a62dcf 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -13,10 +13,10 @@ #include 
"./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -367,7 +367,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c index e13334ae0e..28e6fd65f9 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -229,10 +229,10 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); } -void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -591,7 +591,7 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { @@ -1745,7 +1745,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 2); // Loop filtering - vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); src[0] = t_dst; src[1] = t_dst + 8 * 8; @@ -1766,7 +1766,7 @@ void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); + vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); diff --git a/libs/libvpx/vpx_dsp/x86/mem_sse2.h b/libs/libvpx/vpx_dsp/x86/mem_sse2.h new file mode 100644 index 0000000000..2ce738fb77 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/mem_sse2.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_DSP_X86_MEM_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + const ptrdiff_t 
stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // VPX_DSP_X86_MEM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx.c b/libs/libvpx/vpx_dsp/x86/quantize_avx.c new file mode 100644 index 0000000000..6f4489004d --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_avx.c @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#if defined(_MSC_VER) +#include +#endif +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + *eob_ptr = 0; + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
+ for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_avx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm deleted file mode 100644 index 01c41291be..0000000000 --- a/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm +++ /dev/null @@ -1,544 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - vzeroupper - - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - -%ifnidn %1, b_32x32 - - ; Special case for ncoeff == 16, as it is frequent and we can save on - ; not setting up a loop. - cmp ncoeffmp, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Special case of ncoeff == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.single: - - movifnidn coeffq, coeffmp - movifnidn zbinq, zbinmp - mova m0, [zbinq] ; m0 = zbin - - ; Get DC and first 15 AC coeffs - in this special case, that is all. 
-%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers but we process them as 16 bit numbers - mova m9, [coeffq] - packssdw m9, [coeffq+16] ; m9 = c[i] - mova m10, [coeffq+32] - packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif - - mov r0, eobmp ; Output pointer - mov r1, qcoeffmp ; Output pointer - mov r2, dqcoeffmp ; Output pointer - - pxor m5, m5 ; m5 = dedicated zero - - pcmpeqw m4, m4 ; All word lanes -1 - paddw m0, m4 ; m0 = zbin - 1 - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, we just write zeros - ; to the outputs and we are done. - por m14, m7, m12 - ptest m14, m14 - jnz .single_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [r1 ], ymm5 - mova [r1+32], ymm5 - mova [r2 ], ymm5 - mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif - mov [r0], word 0 - - vzeroupper - RET - -.single_nonzero: - - ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift - - mov r3, iscanmp - - DEFINE_ARGS eob, qcoeff, dqcoeff, iscan - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq ], m11 - mova [qcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+32], m11 - mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif - - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq ], m11 - mova [dqcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+32], m11 - mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif - - mova m6, [iscanq] ; m6 = scan[i] - mova m11, [iscanq+16] ; m11 = scan[i] - - pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 - psubw m6, m6, m7 ; m6 = scan[i] + 1 - psubw m11, m11, m12 ; m11 = scan[i] + 1 - pandn m8, m8, m6 ; m8 = max(eob) - pandn m13, m13, m11 ; m13 = max(eob) - pmaxsw m8, m8, m13 - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [eobq], ax - - vzeroupper - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of ncoeff != 16 - 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.generic: - -%endif ; %ifnidn %1, b_32x32 - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - ; Actual quantization loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant - pcmpeqw m4, m4 ; All lanes -1 -%ifidn %1, b_32x32 - psubw m0, m4 - psubw m1, m4 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - paddw m0, m4 ; m0 = m0 + 1 - - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip forward quickly. 
- por m14, m7, m12 - ptest m14, m14 - jnz .first_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4 ], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4 ], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - - add ncoeffq, mmsize - - punpckhqdq m1, m1 - punpckhqdq m2, m2 - punpckhqdq m3, m3 - punpckhqdq m4, m4 - pxor m8, m8 - - jmp .ac_only_loop - -.first_nonzero: - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - -.ac_only_loop: - -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip this itertion. - ; And just write zeros as the result would be. 
- por m14, m7, m12 - ptest m14, m14 - jnz .rest_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4+ 0], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - -.rest_nonzero: - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - - ; Skip-block, i.e. 
just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - - neg ncoeffq - pxor m7, m7 - -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET -%endmacro - -INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 - -END diff --git a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c index 2c7e431c74..c020b398c3 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -8,37 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" - -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif -} +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -47,202 +24,103 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; + // Setup global values. 
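For context, the scalar recurrence that load_b_values() and calculate_qcoeff() vectorize is roughly the sketch below. The helper name quantize_one_scalar and its free-standing signature are illustrative only, not part of the patch, and the saturating add that _mm_adds_epi16 provides for free is written out explicitly.

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative scalar model of one quantize_b coefficient. */
  static int16_t quantize_one_scalar(int16_t coeff, int16_t zbin, int16_t round,
                                     int16_t quant, int16_t shift,
                                     int16_t dequant, int16_t *dqcoeff) {
    const int abs_coeff = abs(coeff);
    int tmp, q;
    if (abs_coeff < zbin) {               /* the SIMD code tests "> zbin - 1" instead */
      *dqcoeff = 0;
      return 0;
    }
    tmp = abs_coeff + round;
    if (tmp > INT16_MAX) tmp = INT16_MAX; /* _mm_adds_epi16 saturates here */
    q = ((((tmp * quant) >> 16) + tmp) * shift) >> 16;
    q = coeff < 0 ? -q : q;               /* reinsert the sign */
    *dqcoeff = (int16_t)(q * dequant);    /* low 16 bits, as _mm_mullo_epi16 keeps */
    return (int16_t)q;
  }

The 32x32 variant later in this patch differs only in halving zbin and round up front (with rounding), doubling shift, and halving the dequantized result.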
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + calculate_qcoeff(&qcoeff0, round, quant, shift); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + calculate_qcoeff(&qcoeff1, round, quant, shift); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, 
nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - { - // Scan 
for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..3f528e1a97 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
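As in the SSE2 version above, zbin/round/quant/shift are loaded with the DC entry in lane 0 and the AC entry in the remaining lanes, so after the register has been applied to the first eight coefficients (the block containing DC) the code broadcasts the AC half across the whole register, as the "Switch DC to AC" comments note. A minimal sketch of that lane switch (dc_to_ac is a hypothetical name):

  #include <emmintrin.h>

  /* { dc, ac, ac, ... } -> { ac, ac, ac, ... }: duplicate the upper 64 bits. */
  static __m128i dc_to_ac(__m128i v) { return _mm_unpackhi_epi64(v, v); }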
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_ssse3( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
+ zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); +#endif // CONFIG_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm deleted file mode 100644 index ca21539173..0000000000 --- a/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,346 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -; TODO(yunqingwang)fix quantize_b code for skip=1 case. -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. 
- movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 
= scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pxor m11, m11 - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], m5 - mova [qcoeffq+ncoeffq*4+16], m5 - mova [qcoeffq+ncoeffq*4+32], m5 - mova [qcoeffq+ncoeffq*4+48], m5 - mova [dqcoeffq+ncoeffq*4+ 0], m5 - mova [dqcoeffq+ncoeffq*4+16], m5 - mova [dqcoeffq+ncoeffq*4+32], m5 - mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 
0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq - pxor m7, m7 -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 diff --git a/libs/libvpx/vpx_dsp/x86/quantize_x86.h b/libs/libvpx/vpx_dsp/x86/quantize_x86.h new file mode 100644 index 0000000000..34928fbb56 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_x86.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. 
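In scalar terms, the end-of-block value produced by scan_for_eob() and accumulate_eob() together is simply the largest iscan position plus one over the nonzero quantized coefficients. A rough standalone equivalent, assuming the low-bit-depth int16_t coefficient layout (eob_scalar is a hypothetical name):

  #include <stdint.h>

  static int16_t eob_scalar(const int16_t *qcoeff, const int16_t *iscan,
                            intptr_t n_coeffs) {
    int16_t eob = 0;
    intptr_t i;
    for (i = 0; i < n_coeffs; ++i) {
      const int16_t count = (int16_t)(iscan[i] + 1); /* scan index -> count */
      if (qcoeff[i] != 0 && count > eob) eob = count;
    }
    return eob;
  }

The SIMD helper below gets the "+ 1" by subtracting the all-ones zbin mask and keeps a per-lane running maximum, which accumulate_eob() reduces to a single value at the end.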
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c new file mode 100644 index 0000000000..5f2ab6ea71 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX512 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m512i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm512_set1_epi16(0); + sum_ref1 = _mm512_set1_epi16(0); + sum_ref2 = _mm512_set1_epi16(0); + sum_ref3 = _mm512_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load src and all refs + src_reg = _mm512_loadu_si512((const __m512i *)src); + ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); + ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); + ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); + ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m256i sum256; + __m128i sum128; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); + sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); + sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow), + _mm512_extracti32x8_epi32(sum_mlow, 1)); + sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), + _mm256_extractf128_si256(sum256, 1)); + + _mm_storeu_si128((__m128i *)(res), sum128); + } +} diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse3.asm b/libs/libvpx/vpx_dsp/x86/sad_sse3.asm index 18279bdb9d..175dcc0895 100644 --- a/libs/libvpx/vpx_dsp/x86/sad_sse3.asm +++ b/libs/libvpx/vpx_dsp/x86/sad_sse3.asm @@ -165,6 +165,8 @@ paddw mm7, mm3 %endmacro +SECTION .text + ;void int vpx_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse4.asm b/libs/libvpx/vpx_dsp/x86/sad_sse4.asm index bc67447971..03999dfca2 100644 --- a/libs/libvpx/vpx_dsp/x86/sad_sse4.asm +++ b/libs/libvpx/vpx_dsp/x86/sad_sse4.asm @@ -165,6 +165,8 @@ movdqa [rdi + 16], xmm2 %endmacro +SECTION .text + ;void vpx_sad16x16x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm index 49f204fa04..7cf93cf51d 100644 --- a/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm +++ b/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm @@ -146,6 +146,8 @@ %endmacro +SECTION .text + ;void int vpx_sad16x16x3_ssse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm index 6d58321e03..300fa8aabc 100644 --- a/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm +++ b/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm @@ -44,6 +44,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c index bc5362e10f..026d0ca2f2 100644 --- a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -123,6 +123,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); } else { // Generic case + assert(size % 8 == 0); return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } diff --git a/libs/libvpx/vpx_dsp/x86/transpose_sse2.h b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h new file mode 100644 index 0000000000..8a0119ca7e --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_unpackhi_epi32(a0, a1); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h index f8edb1b787..0a9542c85b 100644 --- a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -18,6 +18,9 @@ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + #define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) diff --git a/libs/libvpx/vpx_dsp/x86/variance_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_avx2.c index 8428e0520d..d15a89c746 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_avx2.c @@ -7,16 +7,592 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ + +#include // AVX2 + #include "./vpx_dsp_rtcd.h" +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; + +DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 +}; +/* clang-format on */ + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + // process two 16 byte locations in a 256 bit register + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; ++i) { + // convert up values in 128 bit registers across lanes + const __m256i src0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr))); + const __m256i src1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(src_ptr + source_stride))); + const __m256i ref0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr))); + const __m256i ref1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride))); + const __m256i diff0 = _mm256_sub_epi16(src0, ref0); + const __m256i diff1 = _mm256_sub_epi16(src1, ref1); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 
8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + + // process 64 elements in an iteration + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr)); + const __m256i src1 = + _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride)); + const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr)); + const __m256i ref1 = + _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride)); + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0); + const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1); + const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub); + const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + const __m256i madd2 = _mm256_madd_epi16(diff2, diff2); + const __m256i madd3 = _mm256_madd_epi16(diff3, diff3); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = 
_mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. 
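+// Averaging each pixel with the one sstep bytes ahead (the next column when
+// sstep == 1, the next row when sstep == src_stride) via _mm256_avg_epu8
+// implements the half-sample (offset 4) case without the bilinear filter
+// table; when do_sec is set the result is averaged once more with the
+// second predictor.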
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, src_stride); +} + +static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, 1); +} + +static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); + } + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. 
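+// For the remaining offsets each output pixel is a weighted sum of two
+// neighbouring pixels: the pair is interleaved with unpacklo/hi_epi8,
+// multiplied by the per-offset weights from bilinear_filters_avx2 with
+// _mm256_maddubs_epi16, and rounded as (a * w0 + b * w1 + 8) >> 4 by the
+// FILTER_SRC macro.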
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int offset, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + sec += sec_stride; + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, y_offset, src_stride); +} + +static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, x_offset, 1); +} + +static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + 
CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i src_reg, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i prev_src_pack, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride; + + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(xfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous 
pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); + + FILTER_SRC(yfilter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + + prev_src_pack = src_pack; + + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, unsigned int *sse) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + int sum; + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = bilin interpolation + } else { + spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = bilin interpolation + } else { + spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset, y_offset); + } + } + CALC_SUM_AND_SSE + return sum; +} + +static unsigned int sub_pixel_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + NULL, 0, 0, height, sse); +} + +static unsigned int sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + sec, sec_stride, 1, height, sse); +} + typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned 
int *sse, int *sum); -void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum); - static void variance_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, unsigned int *sse, int *sum, get_var_avx2 var_fn, @@ -44,7 +620,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, vpx_get16x16var_avx2, 16); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, @@ -60,7 +636,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } @@ -69,7 +645,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } @@ -78,7 +654,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); } @@ -87,32 +663,22 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse); - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sseptr); - unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2( + const int se1 = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); unsigned int sse2; const int se2 = - vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); @@ -123,7 +689,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2( + const int se = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } @@ -132,10 +698,10 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, 
unsigned int *sse, const uint8_t *sec) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se1 = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se2 = sub_pixel_avg_variance32xh_avx2( src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, 64, 64, &sse2); const int se = se1 + se2; @@ -149,7 +715,7 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2( + const int se = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c deleted file mode 100644 index 51e6b19ad1..0000000000 --- a/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -/* clang-format off */ -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; -/* clang-format on */ - -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, 
_mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } -} - -void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const 
*)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } -} - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = 
_mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { 
- if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec += sec_stride; - // expand each byte 
to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge 
previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} diff --git a/libs/libvpx/vpx_dsp/x86/variance_sse2.c b/libs/libvpx/vpx_dsp/x86/variance_sse2.c index 1161da4914..8d8bf183b2 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_sse2.c @@ -222,7 +222,7 @@ unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, unsigned int *sse) { int sum; vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c index 727d9d1156..4f164afeb4 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c @@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t 
src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_, sse2); @@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h, int bd); // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index e2311c1167..3f444e2e6a 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -20,21 +20,19 @@ SECTION .text %endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd + f, fxo, fxs, fyo, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h + f, fxo, fxs, fyo, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd shl r4d, 1 - shl srcq, 1 shl src_strideq, 1 - shl dstq, 1 shl dst_strideq, 1 %else cmp r4d, 4 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index bfc816f235..d83507dc99 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -197,6 +197,8 @@ movdqu [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 72f2ff71da..9bffe504b1 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -171,6 +171,8 @@ %endm %endif +SECTION .text + global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE 
sym(vpx_highbd_filter_block1d4_v2_sse2): push rbp diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 7c1ecc0148..d0919695ce 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -12,9 +12,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_ports/mem.h" -// filters for 16_h8 and 16_v8 +// filters for 16_h8 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -35,493 +36,296 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx. -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2( +static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + // load the 2 strides of source - srcReg32b1 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), 1); // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), 1); - // add 
and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); + output_ptr += dst_stride; } // if the number of strides is odd. 
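outReg32b1 and outReg32b2 now come from convolve8_16_avx2(), which factors out the multiply, sum and round sequence that used to be written out inline here. A sketch reconstructed from the deleted code (the shared helper in convolve_avx2.h may order the additions differently): each s[i] holds pixel pairs matched to the duplicated tap pair f[i], the min/max pairing reproduces the deleted code's saturation-safe addition order, and 64 is 1 << (FILTER_BITS - 1), the rounding offset for the final shift by 7.

static INLINE __m256i convolve8_16_avx2_sketch(const __m256i *const s,
                                               const __m256i *const f) {
  const __m256i k_64 = _mm256_set1_epi16(1 << 6);
  // multiply 2 adjacent pixels with their pair of taps and add the products
  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
  // sum the outer products first, then the smaller and larger inner product
  __m256i sum = _mm256_adds_epi16(x0, x3);
  sum = _mm256_adds_epi16(sum, _mm256_min_epi16(x1, x2));
  sum = _mm256_adds_epi16(sum, _mm256_max_epi16(x1, x2));
  // round and shift right by FILTER_BITS (7)
  sum = _mm256_adds_epi16(sum, k_64);
  return _mm256_srai_epi16(sum, 7);
}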
// process only 16 bytes if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; + __m128i srcReg; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + // load the first 16 bytes of the last row + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg1 = convolve8_8_avx2(s, f); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg2 = convolve8_8_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter 
and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } -static void vpx_filter_block1d16_v8_avx2( +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 0); +} + +static void vpx_filter_block1d16_h8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 1); +} + +static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. 
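The one behavioural addition in these rewritten kernels is the avg flag: when it is set, the packed 8-bit result is blended with the pixels already sitting in the destination via _mm_avg_epu8, whose per-byte (a + b + 1) >> 1 matches the rounding average the C convolve_avg reference applies for compound prediction. The thin vpx_filter_block1d16_h8_avx2 / _avg_avx2 wrappers pass a constant 0 or 1, so the branch folds away inside the INLINE body. A minimal sketch of the store-or-average step under those assumptions:

static INLINE void store_or_avg_16(const __m128i res, uint8_t *const dst,
                                   const int avg) {
  if (avg) {
    // rounding average with the prediction already present in dst
    const __m128i d = _mm_load_si128((const __m128i *)dst);
    _mm_store_si128((__m128i *)dst, _mm_avg_epu8(res, d));
  } else {
    _mm_store_si128((__m128i *)dst, res);
  }
}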
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + shuffle_filter_avx2(filter, f); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + { + __m128i s[6]; + __m256i s32b[6]; - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + // have each consecutive loads on the same 256 register + s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = 
_mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) 
+ s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(s1[0]); + outReg2 = _mm256_extractf128_si256(s1[0], 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); output_ptr += dst_stride; - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; } + + // if the number of strides is odd. 
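In the vertical kernel, every pair of vertically adjacent rows is interleaved bytewise (unpacklo/unpackhi) so that _mm256_maddubs_epi16 can apply one duplicated tap pair to both rows at once; because each 256-bit source register also holds row r in its low lane and row r + 1 in its high lane, a single convolve8_16_avx2() call yields two output rows, and the register rotation above only has to refresh two rows per iteration. A 128-bit illustration of the per-tap-pair step, with hypothetical names:

static INLINE __m128i vert_tap_pair_sketch(const __m128i row0,
                                           const __m128i row1,
                                           const __m128i tap_pair) {
  // interleaved = r0[0] r1[0] r0[1] r1[1] ... r0[7] r1[7]
  const __m128i interleaved = _mm_unpacklo_epi8(row0, row1);
  // each 16-bit lane becomes tap0 * r0[x] + tap1 * r1[x]
  return _mm_maddubs_epi16(interleaved, tap_pair);
}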
+ // process only 16 bytes if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i srcRegHead2 = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + s1[0] = _mm256_castsi128_si256( + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + s2[0] = _mm256_castsi128_si256( + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + outReg1 = convolve8_8_avx2(s1, f); + outReg2 = convolve8_8_avx2(s2, f); - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + 
ptrdiff_t dst_stride, uint32_t height, + const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 0); +} + +static void vpx_filter_block1d16_v8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -539,6 +343,14 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -552,23 +364,53 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 +#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, 
ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); FUN_CONV_2D(, avx2); +FUN_CONV_2D(avg_, avx2); #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index b26d97b455..e4f992780f 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -8,52 +8,37 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include +#include // SSSE3 + +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; // These are reused by the avx2 intrinsics. 
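With the extra FUN_CONV_1D/FUN_CONV_2D instantiations above, the AVX2 build now exposes vpx_convolve8_avg_horiz_avx2, vpx_convolve8_avg_vert_avx2 and vpx_convolve8_avg_avx2 alongside the existing non-averaging entry points, so the compound-prediction blend no longer has to fall back to the SSSE3 versions. A hypothetical caller, following the argument order given in the prototype comments (a step of 16 in q4 units means no scaling); the function and parameter names here are illustrative only:

static void blend_second_reference(const uint8_t *ref, ptrdiff_t ref_stride,
                                   uint8_t *pred, ptrdiff_t pred_stride,
                                   const InterpKernel *kernel, int subpel_x,
                                   int subpel_y, int w, int h) {
  // pred already holds the first prediction; the avg variant averages the
  // newly filtered reference block into it rather than overwriting it.
  vpx_convolve8_avg(ref, ref_stride, pred, pred_stride, kernel, subpel_x, 16,
                    subpel_y, 16, w, h);
}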
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +// vpx_filter_block1d8_v8_intrin_ssse3() +// vpx_filter_block1d8_h8_intrin_ssse3() +// vpx_filter_block1d4_h8_intrin_ssse3() + +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} void vpx_filter_block1d4_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -75,8 +60,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3( secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); + shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); @@ -89,25 +74,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3( srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 4 bytes *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); @@ -117,77 +100,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3( } void vpx_filter_block1d8_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i 
srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; unsigned int i; + __m128i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + shuffle_filter_ssse3(filter, f); + filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + filt[3] = + _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + s[0] = _mm_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm_shuffle_epi8(srcReg, filt[3]); + s[0] = convolve8_8_ssse3(s, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + s[0] = _mm_packus_epi16(s[0], s[0]); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 8 bytes - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i 
*)&output_ptr[0], s[0]); output_ptr += output_pitch; } @@ -196,83 +137,49 @@ void vpx_filter_block1d8_h8_intrin_ssse3( void vpx_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; unsigned int i; + __m128i f[4], s[8], ss[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + shuffle_filter_ssse3(filter, f); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = 
_mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + ss[0] = convolve8_8_ssse3(ss, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + ss[0] = _mm_packus_epi16(ss[0], ss[0]); src_ptr += src_pitch; // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); output_ptr += out_pitch; } @@ -306,149 +213,69 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3); -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const 
__m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = 
_mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); } -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; - // This function processes 8x8 areas. The intermediate height is not always + // This function processes 8x8 areas. The intermediate height is not always // a multiple of 8, so force it to be a multiple of 8 here. 
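The scaled-convolve row and column helpers in this file used to round with _mm_mulhrs_epi16(sum, 1 << 8); since pmulhrsw computes ((a * b) + (1 << 14)) >> 15 per 16-bit lane, multiplying by 256 is exactly (sum + 64) >> 7, the same ROUND_POWER_OF_TWO(sum, FILTER_BITS) that the deleted inline code elsewhere applies with an explicit add of 64 and an arithmetic shift, so switching to the shared shuffle_filter/convolve8_8_ssse3 helpers should not change the rounding (the helper's exact body is assumed here, it is not shown in this patch). The hand-rolled transposes are likewise replaced by the shared load_8bit_*/transpose_* helpers without changing the data layout. The rounding identity, stated in scalar form:

static INLINE int16_t round_filter_bits(int32_t sum) {
  // _mm_mulhrs_epi16(sum, 256) == ((sum * 256) + (1 << 14)) >> 15
  //                            == (sum + 64) >> 7
  // i.e. ROUND_POWER_OF_TWO(sum, FILTER_BITS) with FILTER_BITS == 7.
  return (int16_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}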
y = h + (8 - (h & 0x7)); @@ -479,93 +306,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, } while (y -= 8); } -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4], ss[2]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, ss); + // 00 01 10 11 20 21 30 31 + s[0] = ss[0]; // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + s[1] = _mm_srli_si128(ss[0], 8); + // 04 05 14 15 24 25 34 35 + s[2] = ss[1]; // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + s[3] = _mm_srli_si128(ss[1], 8); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = 
_mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); } -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -597,50 +381,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +static __m128i filter_vert_kernel(const __m128i *const s, 
+ const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -659,50 +434,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); +static void filter_vert_w8_ssse3(const uint8_t *const 
src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -719,81 +465,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = 
_mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + loadu_8bit_16x8(src, src_stride, s); + + // merge the result together + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; + src += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } } -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -811,11 +520,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -829,70 +537,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
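+  // Rough bound check of that claim (assuming y0_q4 < 16, i.e. a 1/16-pel + // phase offset): with y_step_q4 = 64 and h <= 32, as allowed by the relaxed + // assert below, intermediate_height is at most + // (((32 - 1) * 64 + 15) >> SUBPEL_BITS) + SUBPEL_TAPS = 124 + 8 = 132, which + // still fits in the 135 + 8 rows reserved for temp.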
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } } -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_, ssse3); diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm index 08f3d6a6cf..8497e1721b 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -176,6 +176,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index c1a6f23abe..952d9307d1 100644 --- 
a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -327,12 +327,12 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg +SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3 +SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3 +SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3 +SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3 +SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3 +SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3 ;------------------------------------------------------------------------------- @@ -795,9 +795,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 +SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3 +SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3 +SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3 +SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3 +SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3 +SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm index a378dd0402..6d79492e4a 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -131,6 +131,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_sse2) PRIVATE sym(vpx_filter_block1d4_v2_sse2): push rbp diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm index 538b2129db..8c9c817be7 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -105,6 +105,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE sym(vpx_filter_block1d4_v2_ssse3): push rbp diff --git a/libs/libvpx/vpx_mem/vpx_mem.c b/libs/libvpx/vpx_mem/vpx_mem.c index c94ed52d16..eeba34c373 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.c +++ b/libs/libvpx/vpx_mem/vpx_mem.c @@ -76,50 +76,9 @@ void *vpx_calloc(size_t num, size_t size) { return x; } -void *vpx_realloc(void *memblk, size_t size) { - void *new_addr = NULL; - - /* - The realloc() function changes the size of the object pointed to by - ptr to the size specified by size, and returns a pointer to the - possibly moved block. The contents are unchanged up to the lesser - of the new and old sizes. If ptr is null, realloc() behaves like - malloc() for the specified size. If size is zero (0) and ptr is - not a null pointer, the object pointed to is freed. 
- */ - if (!memblk) - new_addr = vpx_malloc(size); - else if (!size) - vpx_free(memblk); - else { - void *addr = get_actual_malloc_address(memblk); - const uint64_t aligned_size = - get_aligned_malloc_size(size, DEFAULT_ALIGNMENT); - if (!check_size_argument_overflow(1, aligned_size)) return NULL; - - addr = realloc(addr, (size_t)aligned_size); - if (addr) { - new_addr = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, - DEFAULT_ALIGNMENT); - set_actual_malloc_address(new_addr, addr); - } - } - - return new_addr; -} - void vpx_free(void *memblk) { if (memblk) { void *addr = get_actual_malloc_address(memblk); free(addr); } } - -#if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length) { - size_t i; - uint16_t *dest16 = (uint16_t *)dest; - for (i = 0; i < length; i++) *dest16++ = val; - return dest; -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_mem/vpx_mem.h b/libs/libvpx/vpx_mem/vpx_mem.h index c14f288b89..a4274b8856 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.h +++ b/libs/libvpx/vpx_mem/vpx_mem.h @@ -19,6 +19,8 @@ #include #include +#include "vpx/vpx_integer.h" + #if defined(__cplusplus) extern "C" { #endif @@ -26,11 +28,15 @@ extern "C" { void *vpx_memalign(size_t align, size_t size); void *vpx_malloc(size_t size); void *vpx_calloc(size_t num, size_t size); -void *vpx_realloc(void *memblk, size_t size); void vpx_free(void *memblk); #if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length); +static INLINE void *vpx_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} #endif #include diff --git a/libs/libvpx/vpx_ports/arm_cpudetect.c b/libs/libvpx/vpx_ports/arm_cpudetect.c index 79c60f7a19..4f9d480ade 100644 --- a/libs/libvpx/vpx_ports/arm_cpudetect.c +++ b/libs/libvpx/vpx_ports/arm_cpudetect.c @@ -58,8 +58,12 @@ int arm_cpu_caps(void) { #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif +#ifndef WIN32_EXTRA_LEAN #define WIN32_EXTRA_LEAN +#endif #include int arm_cpu_caps(void) { diff --git a/libs/libvpx/vpx_ports/asmdefs_mmi.h b/libs/libvpx/vpx_ports/asmdefs_mmi.h new file mode 100644 index 0000000000..a9a49745af --- /dev/null +++ b/libs/libvpx/vpx_ports/asmdefs_mmi.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_PORTS_ASMDEFS_MMI_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if HAVE_MMI + +#if HAVE_MIPS64 +#define mips_reg int64_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "ld " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "dmtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "dli " #reg ", " #immediate " \n\t" + +#else +#define mips_reg int32_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "addi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "lw " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "srl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "sll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "mtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "li " #reg ", " #immediate " \n\t" + +#endif /* HAVE_MIPS64 */ + +#endif /* HAVE_MMI */ + +#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */ diff --git a/libs/libvpx/vpx_ports/mem.h b/libs/libvpx/vpx_ports/mem.h index 2d49b7a06d..bfef783b13 100644 --- a/libs/libvpx/vpx_ports/mem.h +++ b/libs/libvpx/vpx_ports/mem.h @@ -23,16 +23,6 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -/* Indicates that the usage of the specified variable has been audited to assure - * that it's safe to use uninitialized. Silences 'may be used uninitialized' - * warnings on gcc. - */ -#if defined(__GNUC__) && __GNUC__ -#define UNINITIALIZED_IS_SAFE(x) x = x -#else -#define UNINITIALIZED_IS_SAFE(x) x -#endif - #if HAVE_NEON && defined(_MSC_VER) #define __builtin_prefetch(x) #endif @@ -45,8 +35,10 @@ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x))) #if CONFIG_VP9_HIGHBITDEPTH #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) +#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x))) #endif // CONFIG_VP9_HIGHBITDEPTH #if !defined(__has_feature) diff --git a/libs/libvpx/vpx_ports/ppc.h b/libs/libvpx/vpx_ports/ppc.h new file mode 100644 index 0000000000..ed29ef25b4 --- /dev/null +++ b/libs/libvpx/vpx_ports/ppc.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_PORTS_PPC_H_ +#define VPX_PORTS_PPC_H_ +#include + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_PORTS_PPC_H_ diff --git a/libs/libvpx/vpx_ports/ppc_cpudetect.c b/libs/libvpx/vpx_ports/ppc_cpudetect.c new file mode 100644 index 0000000000..374a0271c9 --- /dev/null +++ b/libs/libvpx/vpx_ports/ppc_cpudetect.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If VPX_SIMD_CAPS is set then allow only those capabilities. + if (!cpu_env_flags(&flags)) { + return flags; + } + + mask = cpu_env_mask(); + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) { + return 0; + } + + while ((count = read(fd, buf, sizeof(buf))) > 0) { + for (i = 0; i < (count / sizeof(*buf)); i += 2) { + if (buf[i] == AT_HWCAP) { +#if HAVE_VSX + if (buf[i + 1] & PPC_FEATURE_HAS_VSX) { + flags |= HAS_VSX; + } +#endif // HAVE_VSX + goto out_close; + } else if (buf[i] == AT_NULL) { + goto out_close; + } + } + } +out_close: + close(fd); + return flags & mask; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. 
+int ppc_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/libs/libvpx/vpx_ports/vpx_ports.mk b/libs/libvpx/vpx_ports/vpx_ports.mk index 36b14936df..e17145e6cb 100644 --- a/libs/libvpx/vpx_ports/vpx_ports.mk +++ b/libs/libvpx/vpx_ports/vpx_ports.mk @@ -25,3 +25,10 @@ endif PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c PORTS_SRCS-$(ARCH_ARM) += arm.h + +PORTS_SRCS-$(ARCH_PPC) += ppc_cpudetect.c +PORTS_SRCS-$(ARCH_PPC) += ppc.h + +ifeq ($(ARCH_MIPS), yes) +PORTS_SRCS-yes += asmdefs_mmi.h +endif diff --git a/libs/libvpx/vpx_ports/vpx_timer.h b/libs/libvpx/vpx_ports/vpx_timer.h index 4aae30e947..2083b4ece4 100644 --- a/libs/libvpx/vpx_ports/vpx_timer.h +++ b/libs/libvpx/vpx_ports/vpx_timer.h @@ -21,6 +21,8 @@ /* * Win32 specific includes */ +#undef NOMINMAX +#define NOMINMAX #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif @@ -81,7 +83,7 @@ static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { struct timeval diff; timersub(&t->end, &t->begin, &diff); - return diff.tv_sec * 1000000 + diff.tv_usec; + return (int64_t)diff.tv_sec * 1000000 + diff.tv_usec; #endif } diff --git a/libs/libvpx/vpx_ports/x86.h b/libs/libvpx/vpx_ports/x86.h index 6ba02cf1fc..ced65ac058 100644 --- a/libs/libvpx/vpx_ports/x86.h +++ b/libs/libvpx/vpx_ports/x86.h @@ -140,22 +140,28 @@ static INLINE uint64_t xgetbv(void) { #endif #if defined(_MSC_VER) && _MSC_VER >= 1700 +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif #include <windows.h> #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) #define getenv(x) NULL #endif #endif -#define HAS_MMX 0x01 -#define HAS_SSE 0x02 -#define HAS_SSE2 0x04 -#define HAS_SSE3 0x08 -#define HAS_SSSE3 0x10 -#define HAS_SSE4_1 0x20 -#define HAS_AVX 0x40 -#define HAS_AVX2 0x80 +#define HAS_MMX 0x001 +#define HAS_SSE 0x002 +#define HAS_SSE2 0x004 +#define HAS_SSE3 0x008 +#define HAS_SSSE3 0x010 +#define HAS_SSE4_1 0x020 +#define HAS_AVX 0x040 +#define HAS_AVX2 0x080 +#define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1 << n) +#define BIT(n) (1u << n) #endif static INLINE int x86_simd_caps(void) { @@ -204,6 +210,12 @@ static INLINE int x86_simd_caps(void) { cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + + // bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & + // 30 (AVX-512BW) & 31 (AVX-512VL) + if ((reg_ebx & (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) == + (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) + flags |= HAS_AVX512; } } } diff --git a/libs/libvpx/vpx_scale/generic/yv12config.c b/libs/libvpx/vpx_scale/generic/yv12config.c index a674eac84b..9c7ca42c78 100644 --- a/libs/libvpx/vpx_scale/generic/yv12config.c +++ b/libs/libvpx/vpx_scale/generic/yv12config.c @@ -9,6 +9,7 @@ */ #include +#include <limits.h> #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" @@ -165,6 +166,12 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; + // frame_size is stored in buffer_alloc_sz, which is an int. If it won't + // fit, fail early.
+ if (frame_size > INT_MAX) { + return -1; + } + if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; @@ -193,8 +200,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; - if (frame_size != (size_t)frame_size) return -1; - ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; diff --git a/libs/libvpx/vpx_scale/generic/yv12extend.c b/libs/libvpx/vpx_scale/generic/yv12extend.c index a6aaff95a0..e231806505 100644 --- a/libs/libvpx/vpx_scale/generic/yv12extend.c +++ b/libs/libvpx/vpx_scale/generic/yv12extend.c @@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); -#if CONFIG_VP9_HIGHBITDEPTH - if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { - extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, - ybf->y_crop_height, ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); - - extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - - extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - return; - } -#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, ybf->border + ybf->y_height - ybf->y_crop_height, @@ -208,12 +189,55 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { // Copies the source image into the destination image and updates the // destination's UMV borders. // Note: The frames are assumed to be identical in size. + void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +#if CONFIG_VP9 +void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + #if 0 /* These assertions are valid in the codec, but the libvpx-tester uses * this code slightly differently. 
@@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); return; } else { assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); @@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); } +#endif // CONFIG_VP9 void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { diff --git a/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl index 44b115c7eb..1281071a7d 100644 --- a/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl +++ b/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_scale_forward_decls() { print < or +// since neither is guaranteed to exist on both C and C++ platforms, and we need +// to back the atomic type with the same type (g++ needs to be able to use +// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the +// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed it's not +// guaranteed that atomic_int is the same type as std::atomic_int. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13. +#if !defined(__has_builtin) +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif // !defined(__has_builtin) + +#if (__has_builtin(__atomic_load_n)) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those. +#define VPX_USE_ATOMIC_BUILTINS +#else +// Use platform-specific asm barriers. +#if defined(_MSC_VER) +// TODO(pbos): This assumes that newer versions of MSVC are building with the +// default /volatile:ms (or older, where this is always true. Consider adding +// support for using <atomic> instead of stdatomic.h when building C++11 under +// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?), +// there're no explicit Interlocked* functions for only storing or loading +// (presumably because volatile has historically implied that on MSVC). +// +// For earlier versions of MSVC or the default /volatile:ms volatile int are +// acquire/release and require no barrier. +#define vpx_atomic_memory_barrier() \ + do { \ + } while (0) +#else +#if ARCH_X86 || ARCH_X86_64 +// Use a compiler barrier on x86, no runtime penalty. +#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory") +#elif ARCH_ARM +#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif ARCH_MIPS +#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory") +#else +#error Unsupported architecture! +#endif // ARCH_X86 || ARCH_X86_64 +#endif // defined(_MSC_VER) +#endif // atomic builtin availability check + +// These are wrapped in a struct so that they are not easily accessed directly +// on any platform (to discourage programmer errors by setting values directly).
+// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT +// (NOT memset) and accessed through vpx_atomic_ functions. +typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; + +#define VPX_ATOMIC_INIT(num) \ + { num } + +// Initialization of an atomic int, not thread safe. +static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) { + atomic->value = value; +} + +static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE); +#else + vpx_atomic_memory_barrier(); + atomic->value = value; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE); +#else + int v = atomic->value; + vpx_atomic_memory_barrier(); + return v; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +#undef VPX_USE_ATOMIC_BUILTINS +#undef vpx_atomic_memory_barrier + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_UTIL_VPX_ATOMICS_H_ diff --git a/libs/libvpx/vpx_util/vpx_util.mk b/libs/libvpx/vpx_util/vpx_util.mk index c0ef8d3362..86d3ece3c8 100644 --- a/libs/libvpx/vpx_util/vpx_util.mk +++ b/libs/libvpx/vpx_util/vpx_util.mk @@ -8,7 +8,10 @@ ## be found in the AUTHORS file in the root of the source tree. ## +UTIL_SRCS-yes += vpx_atomics.h UTIL_SRCS-yes += vpx_util.mk UTIL_SRCS-yes += vpx_thread.c UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h +UTIL_SRCS-yes += vpx_write_yuv_frame.h +UTIL_SRCS-yes += vpx_write_yuv_frame.c diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.c b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c new file mode 100644 index 0000000000..ab68558115 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ + defined(OUTPUT_YUV_SKINMAP) + + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + +#else + (void)yuv_file; + (void)s; +#endif +} diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.h b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h new file mode 100644 index 0000000000..1cb7029817 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ + +#include +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/libs/libvpx/vpxdec.c b/libs/libvpx/vpxdec.c index ab638ec6b8..ff20e6a3c9 100644 --- a/libs/libvpx/vpxdec.c +++ b/libs/libvpx/vpxdec.c @@ -9,11 +9,11 @@ */ #include +#include +#include #include #include -#include #include -#include #include "./vpx_config.h" @@ -47,6 +47,8 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); @@ -75,7 +77,7 @@ static const arg_def_t outputfile = static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1, "Max threads to use"); static const arg_def_t frameparallelarg = - ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode"); + ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode (ignored)"); static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0, "Show version string"); static const arg_def_t error_concealment = @@ -92,31 +94,24 @@ static const arg_def_t md5arg = static const arg_def_t outbitdeptharg = ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); #endif +static const arg_def_t svcdecodingarg = ARG_DEF( + NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); +static const arg_def_t framestatsarg = + ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); -static const arg_def_t *all_args[] = { &codecarg, - &use_yv12, - &use_i420, - &flipuvarg, - &rawvideo, - &noblitarg, - &progressarg, - &limitarg, - &skiparg, - &postprocarg, - &summaryarg, - &outputfile, - &threadsarg, - &frameparallelarg, - &verbosearg, - &scalearg, - &fb_arg, - &md5arg, - &error_concealment, - &continuearg, +static const arg_def_t *all_args[] = { + &help, &codecarg, &use_yv12, + &use_i420, &flipuvarg, &rawvideo, + &noblitarg, &progressarg, &limitarg, + &skiparg, &postprocarg, &summaryarg, + &outputfile, &threadsarg, &frameparallelarg, + &verbosearg, &scalearg, &fb_arg, + &md5arg, &error_concealment, &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - NULL }; + &svcdecodingarg, &framestatsarg, NULL +}; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -159,41 +154,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif - -void usage_exit(void) { +void show_help(FILE *fout, int shorthelp) { int i; - fprintf(stderr, - "Usage: %s filename\n\n" - "Options:\n", - exec_name); - arg_show_usage(stderr, all_args); + fprintf(fout, "Usage: %s filename\n\n", exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "Options:\n"); + arg_show_usage(fout, all_args); #if CONFIG_VP8_DECODER - 
fprintf(stderr, "\nVP8 Postprocessing Options:\n"); - arg_show_usage(stderr, vp8_pp_args); + fprintf(fout, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(fout, vp8_pp_args); #endif - fprintf(stderr, + fprintf(fout, "\nOutput File Patterns:\n\n" " The -o argument specifies the name of the file(s) to " "write to. If the\n argument does not include any escape " "characters, the output will be\n written to a single file. " "Otherwise, the filename will be calculated by\n expanding " "the following escape characters:\n"); - fprintf(stderr, + fprintf(fout, "\n\t%%w - Frame width" "\n\t%%h - Frame height" "\n\t%% - Frame number, zero padded to places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. If the -o option is " "not specified, the output will be\n directed to stdout.\n"); - fprintf(stderr, "\nIncluded decoders:\n\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", decoder->name, + fprintf(fout, " %-6s - %s\n", decoder->name, vpx_codec_iface_name(decoder->codec_interface())); } +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -495,11 +496,12 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_ctx_t decoder; char *fn = NULL; int i; + int ret = EXIT_FAILURE; uint8_t *buf = NULL; size_t bytes_in_buffer = 0, buffer_size = 0; FILE *infile; int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; - int do_md5 = 0, progress = 0, frame_parallel = 0; + int do_md5 = 0, progress = 0; int stop_after = 0, postproc = 0, summary = 0, quiet = 1; int arg_skip = 0; int ec_enabled = 0; @@ -518,6 +520,8 @@ static int main_loop(int argc, const char **argv_) { #if CONFIG_VP9_HIGHBITDEPTH unsigned int output_bit_depth = 0; #endif + int svc_decoding = 0; + int svc_spatial_layer = 0; #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 }; #endif @@ -536,6 +540,8 @@ static int main_loop(int argc, const char **argv_) { char outfile_name[PATH_MAX] = { 0 }; FILE *outfile = NULL; + FILE *framestats_file = NULL; + MD5Context md5_ctx; unsigned char md5_digest[16]; @@ -556,7 +562,10 @@ static int main_loop(int argc, const char **argv_) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { interface = get_vpx_decoder_by_name(arg.val); if (!interface) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -593,8 +602,9 @@ static int main_loop(int argc, const char **argv_) { else if (arg_match(&arg, &threadsarg, argi)) cfg.threads = arg_parse_uint(&arg); #if CONFIG_VP9_DECODER - else if (arg_match(&arg, &frameparallelarg, argi)) - frame_parallel = 1; + else if (arg_match(&arg, &frameparallelarg, argi)) { + /* ignored for compatibility */ + } #endif else if (arg_match(&arg, &verbosearg, argi)) quiet = 0; @@ -609,6 +619,16 @@ static int main_loop(int argc, const char **argv_) { output_bit_depth = arg_parse_uint(&arg); } #endif + else if (arg_match(&arg, &svcdecodingarg, argi)) { + svc_decoding = 1; + svc_spatial_layer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &framestatsarg, argi)) { + framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } #if CONFIG_VP8_DECODER else if 
(arg_match(&arg, &addnoise_level, argi)) { postproc = 1; @@ -642,6 +662,7 @@ static int main_loop(int argc, const char **argv_) { if (!fn) { free(argv); + fprintf(stderr, "No input file specified!\n"); usage_exit(); } /* Open file */ @@ -717,15 +738,21 @@ static int main_loop(int argc, const char **argv_) { if (!interface) interface = get_vpx_decoder_by_index(0); dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | - (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) | - (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0); + (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0); if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg, dec_flags)) { fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail2; + } + if (svc_decoding) { + if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, + svc_spatial_layer)) { + fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } } - if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER @@ -733,7 +760,7 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) { fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail; } #endif @@ -752,13 +779,15 @@ static int main_loop(int argc, const char **argv_) { &ext_fb_list)) { fprintf(stderr, "Failed to configure external frame buffers: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail; } } frame_avail = 1; got_data = 0; + if (framestats_file) fprintf(framestats_file, "bytes,qp\n"); + /* Decode file */ while (frame_avail || got_data) { vpx_codec_iter_t iter = NULL; @@ -779,11 +808,21 @@ static int main_loop(int argc, const char **argv_) { const char *detail = vpx_codec_error_detail(&decoder); warn("Failed to decode frame %d: %s", frame_in, vpx_codec_error(&decoder)); - if (detail) warn("Additional information: %s", detail); + corrupted = 1; if (!keep_going) goto fail; } + if (framestats_file) { + int qp; + if (vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &qp)) { + warn("Failed VPXD_GET_LAST_QUANTIZER: %s", + vpx_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\n", (int)bytes_in_buffer, qp); + } + vpx_usec_timer_mark(&timer); dx_time += vpx_usec_timer_elapsed(&timer); } else { @@ -799,6 +838,8 @@ static int main_loop(int argc, const char **argv_) { // Flush the decoder in frame parallel decode. if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) { warn("Failed to flush decoder: %s", vpx_codec_error(&decoder)); + corrupted = 1; + if (!keep_going) goto fail; } } @@ -811,7 +852,7 @@ static int main_loop(int argc, const char **argv_) { vpx_usec_timer_mark(&timer); dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); - if (!frame_parallel && + if (!corrupted && vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); if (!keep_going) goto fail; @@ -861,7 +902,7 @@ static int main_loop(int argc, const char **argv_) { "Scaling is disabled in this configuration. 
" "To enable scaling, configure with --enable-libyuv\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail; #endif } } @@ -948,7 +989,7 @@ static int main_loop(int argc, const char **argv_) { if (do_md5) { update_image_md5(img, planes, &md5_ctx); } else { - write_image_file(img, planes, outfile); + if (!corrupted) write_image_file(img, planes, outfile); } } else { generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, @@ -972,17 +1013,21 @@ static int main_loop(int argc, const char **argv_) { fprintf(stderr, "\n"); } - if (frames_corrupted) + if (frames_corrupted) { fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); + } else { + ret = EXIT_SUCCESS; + } fail: if (vpx_codec_destroy(&decoder)) { fprintf(stderr, "Failed to destroy decoder: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; } +fail2: + if (!noblit && single_file) { if (do_md5) { MD5Final(md5_digest, &md5_ctx); @@ -1010,9 +1055,11 @@ fail: free(ext_fb_list.ext_fb); fclose(infile); + if (framestats_file) fclose(framestats_file); + free(argv); - return frames_corrupted ? EXIT_FAILURE : EXIT_SUCCESS; + return ret; } int main(int argc, const char **argv_) { diff --git a/libs/libvpx/vpxenc.c b/libs/libvpx/vpxenc.c index 6e0af57a42..4db7eccc35 100644 --- a/libs/libvpx/vpxenc.c +++ b/libs/libvpx/vpxenc.c @@ -123,6 +123,8 @@ static int fourcc_is_ivf(const char detect[4]) { return 0; } +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); static const arg_def_t outputfile = @@ -199,7 +201,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { &debugmode, +static const arg_def_t *main_args[] = { &help, + &debugmode, &outputfile, &codecarg, &passes, @@ -321,8 +324,11 @@ static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); -static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, - &maxsection_pct, NULL }; +static const arg_def_t corpus_complexity = + ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); +static const arg_def_t *rc_twopass_args[] = { + &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL +}; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -355,6 +361,8 @@ static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); #if CONFIG_VP8_ENCODER static const arg_def_t cpu_used_vp8 = @@ -363,12 +371,21 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); -static const arg_def_t *vp8_args[] = { - &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, - &static_thresh, &token_parts, &arnr_maxframes, &arnr_strength, - &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, - &screen_content_mode, NULL -}; 
+static const arg_def_t *vp8_args[] = { &cpu_used_vp8,
+                                       &auto_altref,
+                                       &noise_sens,
+                                       &sharpness,
+                                       &static_thresh,
+                                       &token_parts,
+                                       &arnr_maxframes,
+                                       &arnr_strength,
+                                       &arnr_type,
+                                       &tune_ssim,
+                                       &cq_level,
+                                       &max_intra_rate_pct,
+                                       &gf_cbr_boost_pct,
+                                       &screen_content_mode,
+                                       NULL };
 static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP8E_SET_ENABLEAUTOALTREF,
                                         VP8E_SET_NOISE_SENSITIVITY,
@@ -381,6 +398,7 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP8E_SET_TUNING,
                                         VP8E_SET_CQ_LEVEL,
                                         VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                                        VP8E_SET_GF_CBR_BOOST_PCT,
                                         VP8E_SET_SCREEN_CONTENT_MODE,
                                         0 };
 #endif
@@ -407,8 +425,6 @@ static const arg_def_t alt_ref_aq = ARG_DEF(NULL, "alt-ref-aq", 1,
 static const arg_def_t frame_periodic_boost =
     ARG_DEF(NULL, "frame-boost", 1,
             "Enable frame periodic boost (0: off (default), 1: on)");
-static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
-    NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
 static const arg_def_t max_inter_rate_pct =
     ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
 static const arg_def_t min_gf_interval = ARG_DEF(
@@ -431,8 +447,8 @@ static const struct arg_enum_list color_space_enum[] = {
 };
 
 static const arg_def_t input_color_space =
-    ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:",
-                 color_space_enum);
+    ARG_DEF_ENUM(NULL, "color-space", 1,
+                 "The color space of input content:", color_space_enum);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -450,6 +466,7 @@ static const arg_def_t inbitdeptharg =
 static const struct arg_enum_list tune_content_enum[] = {
   { "default", VP9E_CONTENT_DEFAULT },
   { "screen", VP9E_CONTENT_SCREEN },
+  { "film", VP9E_CONTENT_FILM },
   { NULL, 0 }
 };
 
@@ -458,8 +475,18 @@ static const arg_def_t tune_content = ARG_DEF_ENUM(
 
 static const arg_def_t target_level = ARG_DEF(
     NULL, "target-level", 1,
-    "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
-    " 11: level 1.1; ... 62: level 6.2)");
+    "Target level\n"
+    " 255: off (default)\n"
+    " 0: only keep level stats\n"
+    " 1: adaptively set alt-ref "
+    "distance and column tile limit based on picture size, and keep"
+    " level stats\n"
+    " 10: level 1.0 11: level 1.1 "
+    "... 62: level 6.2");
+
+static const arg_def_t row_mt =
+    ARG_DEF(NULL, "row-mt", 1,
+            "Enable row based non-deterministic multi-threading in VP9");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -488,6 +515,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
                                        &min_gf_interval,
                                        &max_gf_interval,
                                        &target_level,
+                                       &row_mt,
 #if CONFIG_VP9_HIGHBITDEPTH
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -518,51 +546,60 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP9E_SET_MIN_GF_INTERVAL,
                                         VP9E_SET_MAX_GF_INTERVAL,
                                         VP9E_SET_TARGET_LEVEL,
+                                        VP9E_SET_ROW_MT,
                                         0 };
 #endif
 
 static const arg_def_t *no_args[] = { NULL };
 
-void usage_exit(void) {
+void show_help(FILE *fout, int shorthelp) {
   int i;
   const int num_encoder = get_vpx_encoder_count();
 
-  fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
           exec_name);
 
-  fprintf(stderr, "\nOptions:\n");
-  arg_show_usage(stderr, main_args);
-  fprintf(stderr, "\nEncoder Global Options:\n");
-  arg_show_usage(stderr, global_args);
-  fprintf(stderr, "\nRate Control Options:\n");
-  arg_show_usage(stderr, rc_args);
-  fprintf(stderr, "\nTwopass Rate Control Options:\n");
-  arg_show_usage(stderr, rc_twopass_args);
-  fprintf(stderr, "\nKeyframe Placement Options:\n");
-  arg_show_usage(stderr, kf_args);
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "\nOptions:\n");
+  arg_show_usage(fout, main_args);
+  fprintf(fout, "\nEncoder Global Options:\n");
+  arg_show_usage(fout, global_args);
+  fprintf(fout, "\nRate Control Options:\n");
+  arg_show_usage(fout, rc_args);
+  fprintf(fout, "\nTwopass Rate Control Options:\n");
+  arg_show_usage(fout, rc_twopass_args);
+  fprintf(fout, "\nKeyframe Placement Options:\n");
+  arg_show_usage(fout, kf_args);
 #if CONFIG_VP8_ENCODER
-  fprintf(stderr, "\nVP8 Specific Options:\n");
-  arg_show_usage(stderr, vp8_args);
+  fprintf(fout, "\nVP8 Specific Options:\n");
+  arg_show_usage(fout, vp8_args);
 #endif
 #if CONFIG_VP9_ENCODER
-  fprintf(stderr, "\nVP9 Specific Options:\n");
-  arg_show_usage(stderr, vp9_args);
+  fprintf(fout, "\nVP9 Specific Options:\n");
+  arg_show_usage(fout, vp9_args);
 #endif
-  fprintf(stderr,
+  fprintf(fout,
           "\nStream timebase (--timebase):\n"
          " The desired precision of timestamps in the output, expressed\n"
          " in fractional seconds. Default is 1/1000.\n");
-  fprintf(stderr, "\nIncluded encoders:\n\n");
+  fprintf(fout, "\nIncluded encoders:\n\n");
 
   for (i = 0; i < num_encoder; ++i) {
     const VpxInterface *const encoder = get_vpx_encoder_by_index(i);
     const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
-    fprintf(stderr, " %-6s - %s %s\n", encoder->name,
+    fprintf(fout, " %-6s - %s %s\n", encoder->name,
             vpx_codec_iface_name(encoder->codec_interface()), defstr);
   }
 
-  fprintf(stderr, "\n ");
-  fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n");
+  fprintf(fout, "\n ");
+  fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
+
+void usage_exit(void) {
+  show_help(stderr, 1);
   exit(EXIT_FAILURE);
 }
 
@@ -877,7 +914,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &codecarg, argi)) {
+    if (arg_match(&arg, &help, argi)) {
+      show_help(stdout, 0);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &codecarg, argi)) {
       global->codec = get_vpx_encoder_by_name(arg.val);
       if (!global->codec)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -1211,6 +1251,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
     } else if (arg_match(&arg, &maxsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
 
+      if (global->passes < 2)
+        warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &corpus_complexity, argi)) {
+      config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg);
+
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &kf_min_dist, argi)) {
@@ -1409,6 +1454,7 @@ static void show_stream_config(struct stream_state *stream,
   SHOW(rc_2pass_vbr_bias_pct);
   SHOW(rc_2pass_vbr_minsection_pct);
   SHOW(rc_2pass_vbr_maxsection_pct);
+  SHOW(rc_2pass_vbr_corpus_complexity);
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
@@ -1647,7 +1693,7 @@ static void get_cx_data(struct stream_state *stream,
   *got_data = 0;
   while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) {
     static size_t fsize = 0;
-    static int64_t ivf_header_pos = 0;
+    static FileOffset ivf_header_pos = 0;
 
     switch (pkt->kind) {
       case VPX_CODEC_CX_FRAME_PKT:
@@ -1673,7 +1719,7 @@ static void get_cx_data(struct stream_state *stream,
           fsize += pkt->data.frame.sz;
 
          if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
-            const int64_t currpos = ftello(stream->file);
+            const FileOffset currpos = ftello(stream->file);
            fseeko(stream->file, ivf_header_pos, SEEK_SET);
            ivf_write_frame_size(stream->file, fsize);
            fseeko(stream->file, currpos, SEEK_SET);
@@ -1873,8 +1919,6 @@ int main(int argc, const char **argv_) {
   memset(&input, 0, sizeof(input));
   exec_name = argv_[0];
 
-  if (argc < 3) usage_exit();
-
   /* Setup default input stream settings */
   input.framerate.numerator = 30;
   input.framerate.denominator = 1;
@@ -1888,6 +1932,8 @@ int main(int argc, const char **argv_) {
   argv = argv_dup(argc - 1, argv_ + 1);
   parse_global_config(&global, argv);
 
+  if (argc < 3) usage_exit();
+
   switch (global.color_type) {
     case I420: input.fmt = VPX_IMG_FMT_I420; break;
     case I422: input.fmt = VPX_IMG_FMT_I422; break;
@@ -1921,7 +1967,10 @@ int main(int argc, const char **argv_) {
   /* Handle non-option arguments */
   input.filename = argv[0];
 
-  if (!input.filename) usage_exit();
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
   if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0;
diff --git a/libs/libvpx/webmdec.cc b/libs/libvpx/webmdec.cc
index ed4bd700dd..d609075a93 100644
--- a/libs/libvpx/webmdec.cc
+++ b/libs/libvpx/webmdec.cc
@@ -165,10 +165,11 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
     }
     if (get_new_block) {
       block = block_entry->GetBlock();
+      if (block == NULL) return -1;
       webm_ctx->block_frame_index = 0;
     }
-  } while (block->GetTrackNumber() != webm_ctx->video_track_index ||
-           block_entry_eos);
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
 
   webm_ctx->cluster = cluster;
   webm_ctx->block_entry = block_entry;
diff --git a/libs/libvpx/y4menc.c b/libs/libvpx/y4menc.c
index e26fcaf6ea..05018dbc43 100644
--- a/libs/libvpx/y4menc.c
+++ b/libs/libvpx/y4menc.c
@@ -19,9 +19,9 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height,
     case 8:
       color = fmt == VPX_IMG_FMT_444A
                   ? "C444alpha\n"
-                  : fmt == VPX_IMG_FMT_I444 ? "C444\n" : fmt == VPX_IMG_FMT_I422
-                                                             ? "C422\n"
-                                                             : "C420jpeg\n";
+                  : fmt == VPX_IMG_FMT_I444
+                        ? "C444\n"
+                        : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n";
       break;
     case 9:
       color = fmt == VPX_IMG_FMT_I44416
diff --git a/libs/libvpx/y4minput.c b/libs/libvpx/y4minput.c
index acf7d69fe9..1de636cc0b 100644
--- a/libs/libvpx/y4minput.c
+++ b/libs/libvpx/y4minput.c
@@ -195,26 +195,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
       window.*/
     for (x = 0; x < OC_MINI(_c_w, 2); x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
-                 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
-                 9 * _src[OC_MINI(x + 2, _c_w - 1)] +
-                 _src[OC_MINI(x + 3, _c_w - 1)] + 64) >>
-                7,
+          0,
+          (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
+           35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+           9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] +
+           64) >>
+              7,
          255);
     }
     for (; x < _c_w - 3; x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
-                 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
-                7,
+          0,
+          (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+           35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
+              7,
          255);
     }
     for (; x < _c_w; x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
-                 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
-                 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
-                7,
+          0,
+          (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+           35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+           9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
+              7,
          255);
     }
     _dst += _c_w;
@@ -314,28 +317,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
         for (x = 0; x < c_w; x++) {
           for (y = 0; y < OC_MINI(c_h, 3); y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
-                       35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
-                       17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
-                       4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
-                      7,
+                0,
+                (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
+                 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+                 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h - 2; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
-                       35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
-                       17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
-                      7,
+                0,
+                (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+                 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
-                       35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
-                       17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
-                       4 * tmp[(c_h - 1) * c_w] + 64) >>
-                      7,
+                0,
+                (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+                 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+                 4 * tmp[(c_h - 1) * c_w] + 64) >>
+                    7,
                 255);
           }
           _dst++;
@@ -361,10 +367,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
           }
           for (; y < c_h - 3; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
-                       114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
-                       9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
-                      7,
+                0,
+                (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+                 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
+                 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h; y++) {
@@ -404,18 +411,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
   for (x = 0; x < _c_w; x++) {
     for (y = 0; y < OC_MINI(_c_h, 2); y += 2) {
       _dst[(y >> 1) * _c_w] =
-          OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
-                           17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
-                           3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
-                          7,
+          OC_CLAMPI(0,
+                    (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
+                     17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
+                     3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
+                        7,
                     255);
     }
     for (; y < _c_h - 3; y += 2) {
       _dst[(y >> 1) * _c_w] =
-          OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
-                           17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
-                           78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
-                          7,
+          OC_CLAMPI(0,
+                    (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
+                     17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
+                     78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
+                        7,
                     255);
     }
     for (; y < _c_h; y += 2) {
@@ -642,33 +651,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
         4-tap Mitchell window.*/
      for (x = 0; x < OC_MINI(c_w, 1); x++) {
        tmp[x << 1] = (unsigned char)OC_CLAMPI(
-            0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
-                   _aux[OC_MINI(2, c_w - 1)] + 64) >>
-                  7,
+            0,
+            (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
+             _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
            255);
        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
-            0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
-                   5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
-                  7,
+            0,
+            (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
+             5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
            255);
      }
      for (; x < c_w - 2; x++) {
        tmp[x << 1] =
-            (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] +
-                                            18 * _aux[x + 1] - _aux[x + 2] + 64) >>
-                                           7,
+            (unsigned char)OC_CLAMPI(0,
+                                     (_aux[x - 1] + 110 * _aux[x] +
+                                      18 * _aux[x + 1] - _aux[x + 2] + 64) >>
+                                         7,
                                      255);
        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
-            0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
-                   5 * _aux[x + 2] + 64) >>
-                  7,
+            0,
+            (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
+             5 * _aux[x + 2] + 64) >>
+                7,
            255);
      }
      for (; x < c_w; x++) {
        tmp[x << 1] = (unsigned char)OC_CLAMPI(
-            0, (_aux[x - 1] + 110 * _aux[x] +
-                   18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >>
-                  7,
+            0,
+            (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] -
+             _aux[c_w - 1] + 64) >>
+                7,
            255);
        if ((x << 1 | 1) < dst_c_w) {
          tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
@@ -718,27 +732,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
     /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
     for (y = 0; y < c_h; y++) {
       for (x = 0; x < OC_MINI(c_w, 2); x += 2) {
-        tmp[x >> 1] =
-            OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
-                             17 * _aux[OC_MINI(2, c_w - 1)] +
-                             3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
-                            7,
-                      255);
+        tmp[x >> 1] = OC_CLAMPI(0,
+                                (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
+                                 17 * _aux[OC_MINI(2, c_w - 1)] +
+                                 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
+                                    7,
+                                255);
       }
       for (; x < c_w - 3; x += 2) {
-        tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) -
-                                       17 * (_aux[x - 1] + _aux[x + 2]) +
-                                       78 * (_aux[x] + _aux[x + 1]) + 64) >>
-                                      7,
+        tmp[x >> 1] = OC_CLAMPI(0,
+                                (3 * (_aux[x - 2] + _aux[x + 3]) -
+                                 17 * (_aux[x - 1] + _aux[x + 2]) +
+                                 78 * (_aux[x] + _aux[x + 1]) + 64) >>
+                                    7,
                                 255);
       }
       for (; x < c_w; x += 2) {
-        tmp[x >> 1] = OC_CLAMPI(
-            0, (3 * (_aux[x - 2] + _aux[c_w - 1]) -
-                   17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
-                   78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
-                  7,
-            255);
+        tmp[x >> 1] =
+            OC_CLAMPI(0,
+                      (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+                       17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+                       78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
+                          7,
+                      255);
       }
       tmp += dst_c_w;
      _aux += c_w;